library(meffil)
#options(mc.cores=1)


library(tidyverse)
library(readxl)
library(rebus)
library(ewastools)
library(kableExtra)
library(gtools)
library(rlist)
library(broom)
library(lmtest)
library(sandwich)
library(scattermore)
library(janitor)
library(PCAtools)
library(corrplot)
library(GGally)
library(prediction)
library(ggVennDiagram)
library(ggpubr)
library(performance)
library(sesame)
library(parallel)
library(doParallel)
library(ComplexHeatmap)
library(circlize)
library(betareg)
library(table1)
library(DT)
library(DNAmArray)
library(DiagrammeR)
library(rebus)
library(data.table)



mutate <- dplyr::mutate 
select <- dplyr::select
rename <- dplyr::rename
or <- rebus::or



library(here)




Read in data

Important note:

Samples marked for dropping will be dropped just after ComBat for EWAS analyses and filtered after uploading to the clock website for epigenetic age analyses.




# write.csv(df5, file = "congo_ewas_pheno_20221120.csv",
#          row.names = FALSE)

df5 <- read.csv(file = here("data","congo_ewas_pheno_20221120.csv"))

# Get a vector of samples that correspond to sample mix-ups, accidental
# duplicates collected at different times,
# siblings to remove, etc. These are in the "drop_sample" column. These
# were identified in initial sample quality control.

droppers <- df5 %>% 
  filter(drop_sample == TRUE) %>% 
  pull(methylation_id)

# Dropped a total of 140 samples there.496-140 = 356 idat files

cat("We have",nrow(df5),"pairs of idat files and",
    length(unique(paste0(df5$dyad,df5$num,df5$tissue))) - 8,
    "unique individuals.")
## We have 356 pairs of idat files and 324 unique individuals.

perform QC

bdf <- df5 %>% 
  filter(tissue == c("baby_venous_blood")) %>% 
  rename(mom_age = "age") %>% 
  mutate(Age = 0) 

mdf <- df5 %>% 
  filter(tissue == c("mother_venous_blood")) %>% 
  mutate(Sex = c("F")) %>% 
  mutate(Age = age)

# 177 babies and 179 mothers.

# meffil.list.cell.type.references()

# [1] "andrews and bakulski cord blood" "blood gse35069"                 
#  [3] "blood gse35069 chen"             "blood gse35069 complete"        
#  [5] "blood idoloptimized"             "blood idoloptimized epic"       
#  [7] "combined cord blood"             "cord blood gse68456"            
#  [9] "gervin and lyle cord blood"      "guintivano dlpfc"               
# [11] "saliva gse48472"

if(file.exists(here("output","qc.objects.baby.Robj"))) {
  
  load(here("output","qc.objects.baby.Robj"))
  
} else {

qc.objects.baby <- meffil.qc(bdf, cell.type.reference="combined cord blood", verbose=TRUE)
save(qc.objects.baby,file = here("output","qc.objects.baby.Robj"))

}


if(file.exists(here("output","qc.objects.mother.Robj"))) {
  
  load(here("output","qc.objects.mother.Robj"))
  
} else {


qc.objects.mother <- meffil.qc(mdf, cell.type.reference="blood idoloptimized epic", verbose=TRUE)
save(qc.objects.mother,file=here("output","qc.objects.mother.Robj"))

}


 qc.parameters <- meffil.qc.parameters(
    beadnum.samples.threshold             = 0.1,
    detectionp.samples.threshold          = 0.1,
    detectionp.cpgs.threshold             = 0.1,
    beadnum.cpgs.threshold                = 0.1,
    sex.outlier.sd                        = 5
)

 
# Get quality control summary reports
 
if(file.exists(here("output","qcsummary.baby.Robj"))) {
  
  load(here("output","qcsummary.baby.Robj"))
  
} else {
 
 qc.summary.baby <- meffil.qc.summary(
    qc.objects.baby,
    parameters = qc.parameters
 )

 save(qc.summary.baby, file = here("output","qcsummary.baby.Robj"))

 meffil.qc.report(qc.summary.baby, 
                  output.file = here("output","qc.report.baby.html"))

}

 
if(file.exists(here("output","qcsummary.mother.Robj"))){
  
  load(here("output","qcsummary.mother.Robj"))
  
} else {
 
 qc.summary.mother <- meffil.qc.summary(
    qc.objects.mother,
    parameters = qc.parameters
 )

save(qc.summary.mother, file = here("output","qcsummary.mother.Robj"))

meffil.qc.report(qc.summary.mother, 
                 output.file = here("output","qc.report.mother.html"))

}

# List the quality control outliers


# for babies:
outlier.baby <- qc.summary.baby$bad.samples
table(outlier.baby$issue)
## 
## Control probe (extension.G.12719506) Control probe (extension.G.74666473) 
##                                    1                                    1 
## Control probe (extension.R.21752326) Control probe (extension.R.63642461) 
##                                    1                                    1 
##        Control probe (hybe.21771417)        Control probe (hybe.39782321) 
##                                    2                                    1 
##   Control probe (nonpoly.G.23663352)   Control probe (nonpoly.G.38796356) 
##                                    1                                    1 
##                Control probe (normC)                Control probe (normG) 
##                                    1                                    1 
##            Control probe (oob.G.99%)     Control probe (spec1.G.51804467) 
##                                    1                                    1 
##          Control probe (spec1.ratio)         Control probe (spec1.ratio1) 
##                                    1                                    1 
##         Control probe (spec1.ratio2)          Control probe (spec2.ratio) 
##                                    1                                    1 
##                    Detection p-value           Methylated vs Unmethylated 
##                                    1                                    2 
##                         Sex mismatch                    X-Y ratio outlier 
##                                    5                                    1
index.baby <- outlier.baby$issue %in% c("Control probe (dye.bias)", 
                              "Methylated vs Unmethylated",
                              "X-Y ratio outlier",
                              "Low bead numbers",
                              "Detection p-value",
                              "Sex mismatch",
                              "Control probe (bisulfite1)",
                              "Control probe (bisulfite2)")

outlier.baby <- outlier.baby[index.baby,]



# For mothers:
outlier.mother <- qc.summary.mother$bad.samples
table(outlier.mother$issue)
## 
## Control probe (hybe.21771417)   Control probe (spec1.ratio) 
##                             1                             1 
##  Control probe (spec1.ratio1)    Methylated vs Unmethylated 
##                             1                             3 
##                  Sex mismatch 
##                             1
index.mother <- outlier.mother$issue %in% c("Control probe (dye.bias)", 
                              "Methylated vs Unmethylated",
                              "X-Y ratio outlier",
                              "Low bead numbers",
                              "Detection p-value",
                              "Sex mismatch",
                              "Control probe (bisulfite1)",
                              "Control probe (bisulfite2)")

outlier.mother <- outlier.mother[index.mother,]

extract cell types

cc.baby <- t(sapply(qc.objects.baby, function(obj) obj$cell.counts$counts))
cc.baby <- data.frame(IID=row.names(cc.baby),cc.baby)
write.csv(cc.baby, file = here("output","congo_baby_cell_composition_20220613.csv"))

cc.mother <- t(sapply(qc.objects.mother, function(obj) obj$cell.counts$counts))
cc.mother <- data.frame(IID=row.names(cc.mother),cc.mother)
write.csv(cc.mother, file = here("output","congo_mother_cell_composition_20220613.csv"))

Contamination check

Use ewastools package for this.

if(file.exists(here("output","snps.RDS"))){
  
  snps <- readRDS(file = here("output","snps.RDS"))
  
} else {

meth <- read_idats(df5$Basename,quiet = FALSE) # This includes all mother and
# baby samples at birth of the correct tissue type.

detectionP <- ewastools::detectionP
mask <- ewastools::mask
correct_dye_bias <- ewastools::correct_dye_bias
dont_normalize <- ewastools::dont_normalize

# get betas
beta <- meth %>% detectionP %>% mask(0.01) %>% correct_dye_bias %>% dont_normalize

# get snps
snps <- meth$manifest[probe_type=="rs",index]
snps <- beta[snps,]

saveRDS(snps, file = here("output","snps.RDS"))

rm(meth)
rm(beta)

}


genotypes <- call_genotypes(snps,learn=FALSE)

df5$outlier <- snp_outliers(genotypes)

contam <- df5[df5$outlier > -4,c("Sample_Name","outlier")]

contam %>% kbl(caption="Samples identified as contaminated by ewastools") %>% kable_styling("hover",full_width=F)
Samples identified as contaminated by ewastools
Sample_Name outlier
260 SV001b.b.1.2 -3.463359
293 SV018M.1.1 -3.904566

SeSame noob

Get noob corrected betas for the epigenetic clock.

if(file.exists(here("data", "congo_mothers_and_babies_noob_betas.rds"))) {

noobBetas <- readRDS(file = here("data", "congo_mothers_and_babies_noob_betas.rds"))

} else {
 
 cl <- makeCluster(16)
 registerDoParallel(cl)
 clusterExport(cl, c("readIDATpair","noob","getBetas"))


 noobBetas <- do.call(cbind, parLapply(cl,df5$Basename, function(pfx) {
    getBetas(noob(readIDATpair(pfx)), mask = F)
 }))

 stopCluster(cl)

 colnames(noobBetas) <- df5$methylation_id

 saveRDS(noobBetas, file = here("data","congo_mothers_and_babies_noob_betas.rds"))

}

prepare data for epigenetic clock

df6 <- df5 %>% 
  mutate(Age = ifelse(tissue == "baby_venous_blood",0,age)) %>% 
  mutate(Female = ifelse(tissue == "mother_venous_blood",1,Female))


clock <- df6 %>% 
  select(methylation_id,
         slide_position,
         Age,
         Tissue,
         Female)




# Reformat the noob betas object obtained in the chunk above this one
# and filter them for the probes needed for the epigenetic clock.

noobBetas <- as.data.frame(noobBetas)
noobBetas$Name <- rownames(noobBetas)
probes <- read.csv(here("supp_data","datMiniAnnotation3.csv"))


# get betas for Horvath's probe list
fil.noob.betas <- merge(probes, noobBetas, by = "Name", all.x = TRUE, all.y = FALSE)
nrow(fil.noob.betas) # 30084 is what we need

fil.noob.betas[1:10,1:10]


# remove extra columns
fil.noob.betas <- fil.noob.betas[-c(2:7)]

colnames(fil.noob.betas)[1] <- "ProbeID"

cn <- colnames(fil.noob.betas[-1])

link <- match(cn,clock$methylation_id)

cc2 <- clock[link,]

identical(colnames(fil.noob.betas[-1]),cc2$methylation_id)

write.csv(cc2, 
          file = here("data","epigenetic_clock_samplesheet.csv"), 
          row.names = F)
write.csv(fil.noob.betas, 
          row.names = F, 
          file = here("data","epigenetic_clock_betas.csv"))

format epigenetic age data

# Read in output from Horvath's website
mage <- read.csv(here("data","epigenetic_clock_betas.output.csv"))

# Create a data frame of problematic samples by the epigenetic clock
# quality control checks

gs <- mage %>% 
  filter(corSampleVSgoldstandard < 0.80) %>% 
  select(SampleID)

clock_sex_checks <- mage %>% 
  mutate(problem_sex = case_when(
    predictedGender == "female" & Female == 1 ~ FALSE,
    predictedGender == "male" & Female == 0 ~ FALSE,
    predictedGender == "female" & Female == 0 ~ TRUE,
    predictedGender == "male" & Female == 1 ~ TRUE,
    predictedGender == "Unsure" ~ TRUE)) %>% 
  filter(problem_sex == TRUE)



# write function to average replicates

avgReps <- function(repsToAverage){
  
  dr <- mage[mage$methylation_id %in% repsToAverage,]
  
  row <- dr %>% 
    summarise_if(is.numeric, mean, na.rm = TRUE)
  
  x <- smartbind(dr,row,fill="replaceme")
  
  dex <- grep("replaceme",x[nrow(x), ])
  
  fillers <- x[nrow(x) - 1, dex]
  
  fin <- replace(x[nrow(x), ], dex, fillers)
  
  return(fin[nrow(fin),])
}

# Get replicate status and subset for all replicates.
# Use this to get list of which samples to average.


rav <- df6 %>% 
  select(methylation_id,replicate,dyad,tissue) %>% 
  filter(replicate == TRUE) %>% 
  inner_join(mage, by = c("methylation_id" = "methylation_id"))%>% 
  group_by(dyad,tissue) %>% 
  select(methylation_id) 

toAverage <- by(rav$methylation_id,rav$dyad,print)

# average by replicates and store the new
# rows in a data frame

rep1 <- avgReps(toAverage$C1)
rep2 <- avgReps(toAverage$C12)
rep3 <- avgReps(toAverage$C20)
rep4 <- avgReps(toAverage$C29)
rep5 <- avgReps(toAverage$C49[c(1,5)])
rep6 <- avgReps(toAverage$C49[c(2:4)])
rep7 <- avgReps(toAverage$C5)
rep8 <- avgReps(toAverage$C62)
rep9 <- avgReps(toAverage$C89[c(1,3)])
rep10 <- avgReps(toAverage$C89[c(2,4)])
rep11 <- avgReps(toAverage$C91)
rep12 <- avgReps(toAverage$SV11)
rep13 <- avgReps(toAverage$SV51)
rep14 <- avgReps(toAverage$SV61)
rep15 <- avgReps(toAverage$SV67)
rep16 <- avgReps(toAverage$SV7)
rep17 <- avgReps(toAverage$SV76)

averaged_reps <- bind_rows(rep1,rep2,rep3,rep4,rep5,rep6,
                           rep7,rep8,rep9,rep10,rep11,
                           rep12,rep13,rep14,rep15,rep16,rep17)



# remove replicates rows from original epigenetic age
# data frame and then
# Add back in the rows with the average replicate
# values. For character vectors, such as slide
# and position on slide, just selecting a single
# value because these cannot be averaged. That's
# fine as long as we're only using numeric variables.

mage2 <- mage %>% 
  filter(!(methylation_id %in% rav$methylation_id)) %>% 
  bind_rows(averaged_reps)

cat("There were",length(rav$methylation_id),"replicate samples total out of",
    nrow(mage),"samples in epigenetic age analyses, corresponding to",
    nrow(averaged_reps),"unique individuals for whom we had replicates samples.")
cat("These were removed from the data, averaged by replicate, and
    then added back into the data, leaving",
    nrow(mage2),"samples.")

# Samples failing QC checks should be removed
# at a later point, after considering all
# measures.



# Finally, create a wide data set so that
# each row represents a single dyad. Subset
# mothers and babies into separate data
# frames, and then column bind them adding
# suffixes to each column name according
# to which data frame that column is from.


mage3 <- df6 %>% 
  select(-Tissue,-Female,-Age,-slide_position) %>% 
  inner_join(mage2, by = c("methylation_id" = "methylation_id"))

write.csv(mage3, file = here("output","congo_epigenetic_clock_20220706.csv"))

mage3_baby <- mage3 %>% 
  filter(tissue == "baby_venous_blood")

mage3_mother <- mage3 %>% 
  filter(tissue == "mother_venous_blood")

setdiff(mage3_baby$dyad,mage3_mother$dyad)
setdiff(mage3_mother$dyad,mage3_baby$dyad)

mage4 <- mage3_baby %>% 
  full_join(mage3_mother, by = c("dyad" = "dyad"), suffix = c("_baby","_mother"))

write.csv(mage4, file = here("output","congo_epigenetic_clock_wide_20220706.csv"))

Sesame preprocessed betas

# Illumina's EPIC manifest

illumina <- read.csv(here("supp_data",
                          "infinium-methylationepic-v-1-0-b5-manifest-file.csv"))

illumina2 <- illumina %>% 
  select(IlmnID,UCSC_RefGene_Group) %>% 
  rename(probeID = IlmnID,gene_context = UCSC_RefGene_Group)

# Get the probes to mask

zhou1 <- read.table(file = here("supp_data","EPIC.hg38.manifest.pop.tsv"),
                    sep = '\t', header = TRUE)

# Get the cpg island context and distance to transcription start site columns
zhou2 <- read.table(file = here("supp_data","EPIC.hg38.manifest.gencode.v36.tsv"), sep = '\t', header = TRUE)

zhou3 <- read.table(file = here("supp_data","EPIC.hg38.manifest.tsv"),
                    sep = '\t', header = TRUE)

zhou <- zhou1 %>% 
  select(probeID,MASK_general_AFR) %>% 
  left_join(zhou3, by = c("probeID" = "probeID"))
  


# Get probes to mask for quality reasons plus
# high snp frequencies near the extension base
# for the African super population from 1000
# Genomes Project

mask <- zhou %>% 
  filter(MASK_general_AFR == TRUE) %>% 
  select(probeID)

#

#

if (file.exists(here("data","congo_mothers_and_babies_sesame_betas_20220630.rds"))){
  
  betas <- readRDS(here("data","congo_mothers_and_babies_sesame_betas_20220630.rds"))
  
} else {

cl <- makeCluster(16)
registerDoParallel(cl)
clusterExport(cl, c("readIDATpair","noob","getBetas","dyeBiasNL",
                    "pOOBAH","getBetas","addMask","openSesame","mask",
                    "inferInfiniumIChannel", "prefixMaskButCG"))



betas <-  do.call(cbind,
    parLapply(cl,df6$Basename, function(pfx) {
        getBetas(
          noob(
          pOOBAH(
          dyeBiasNL(
          inferInfiniumIChannel(
          addMask(
            prefixMaskButCG(
            readIDATpair(pfx)), probes = mask$probeID))))))
}))


#
stopCluster(cl)
#
colnames(betas) <- df6$methylation_id

saveRDS(betas, file = here("data","congo_mothers_and_babies_sesame_betas_20220630.rds"))

}

Raw betas

if (file.exists(here("data","congo_mothers_and_babies_raw_betas_20220630.rds"))){
  
 raw_betas <- readRDS(here("data","congo_mothers_and_babies_raw_betas_20220630.rds"))
  
} else {

cl <- makeCluster(16)
registerDoParallel(cl)
clusterExport(cl, c("readIDATpair","noob","getBetas","getBetas","addMask","mask",
                    "inferInfiniumIChannel","prefixMaskButCG"))



raw_betas <-  do.call(cbind,
    parLapply(cl,df6$Basename, function(pfx) {
        getBetas(
          addMask(
            prefixMaskButCG(
            readIDATpair(pfx)), probes = mask$probeID))
}))



stopCluster(cl)

colnames(raw_betas) <- df6$methylation_id

saveRDS(raw_betas, file = here("data","congo_mothers_and_babies_raw_betas_20220630.rds"))

}

Zero Intensity Probes

########################################################################
###                                                                  ###
###               REMOVING ZERO INTENSITY PROBES                     ###
###                                                                  ###
########################################################################

### From the DNAmArray R Package: https://urldefense.proofpoint.com/v2/url?u=https-3A__github.com_molepi_DNAmArray&d=DwIGAg&c=sJ6xIWYx-zLMB3EPkvcnVg&r=sWbu0rLU_L4ssDfYmHby1mWqmTG4ExdsuRnL2LYjxa8&m=zYM2gm7K2prO9Jt7dxhNG1vFdBYlyEwJ_EJaDAcASlI&s=socmXYQLInnwM7c0fOpTC5UoKzpeHv_0xg5tgJ8KYoY&e=


# This is all to get the zeroIntensityProbes. If you have them,
# load them. Otherwise start the chunk and get a coffee.

if (file.exists(here("output","zeroIntensityProbes.rds"))){
  
  zeroIntensityProbes <- readRDS(here("output","zeroIntensityProbes.rds"))
  
} else {
  
# Make a combined samplesheet to read into the targets function
  
batch1 <- read.csv(here("data","samplesheet_april2016.csv")) %>% 
  mutate(Sample_Plate = as.character(Sample_Plate))
batch2 <- read.csv(here("data","samplesheet_aug2016.csv"))
batch3 <- read.csv(here("data","samplesheet_jan2019.csv"))
batch4 <- read.csv(here("data","samplesheet_jan2021.csv"))
batch5 <- read.csv(here("data","samplesheet_june2016.csv"))
batch6 <- read.csv(here("data","samplesheet_march2021.csv"))

batches <- bind_rows(batch1,batch2,batch3,batch4,batch5,batch6) %>% 
  filter(Sample_Name %in% df6$core_id)

write.csv(batches,
          here("data","combined_samplesheet_zero_intensity.csv"),
          row.names = F)
  



# Generate targets object
targets <- read.metharray.sheet(here("data"),
                                pattern = c("combined_samplesheet_zero_intensity.csv"))


# get an "RGChannelSetExtended" object. They parallelize this
# in the vignette but I purposefully do not. Ironically, it
# breaks on the cluster, which is made for parallelization...

RGset2 <- read.metharray.exp(targets = targets,
                             verbose = FALSE, 
                             extended=TRUE,
                             force = TRUE)

# Read idats into a DNAmArray "RGChannelSetExtended" object
probeFiltering <- function(RGset, cutbead=3, zeroint=TRUE, verbose=TRUE){

  if(class(RGset) != "RGChannelSetExtended")
    stop("RGset should be of class 'RGChannelSetExtended' in order to perform filtering on number of beads!")

  ##Filter on number of beads
  if(verbose)
    cat("Filtering on number of beads... \n")

  beadmat <- getNBeads(RGset)

  idBeadmat <- beadmat < cutbead
  ##beadmat[idBeadmat] <- NA

  if(verbose)
    cat("On average", round(100*sum(idBeadmat)/prod(dim(idBeadmat)), 2),"% of the probes (",nrow(idBeadmat),") have number of beads below", cutbead, "\n")

  ##Filter on Red and Green intensity <1
  if(zeroint) {
    if(verbose)
      cat("Filtering on zero intensities... \n")

    Grn <- getGreen(RGset)
    Red <- getRed(RGset)

    ##determine if Grn and/or Red intensities of type II probes are <1
    idT2 <- Grn[getProbeInfo(RGset, type = "II")$AddressA,] < 1 | Red[getProbeInfo(RGset, type = "II")$AddressA,] < 1

    ##determine if either Grn or Red intensities of Type I probes are <1
    idT1Grn <- Grn[c(getProbeInfo(RGset, type = "I-Green")$AddressA,
                     getProbeInfo(RGset, type = "I-Green")$AddressB),] < 1

    idT1Red <- Red[c(getProbeInfo(RGset, type = "I-Red")$AddressA,
                     getProbeInfo(RGset, type = "I-Red")$AddressB),] < 1

    if(verbose) {
      cat("On average", round(100*sum(idT2)/prod(dim(idT2)), 3),"% of the Type II probes (",nrow(idT2),") have Red and/or Green intensity below 1 \n")
      cat("On average", round(100*sum(idT1Grn)/prod(dim(idT1Grn)), 3),"% of the Type I probes (",nrow(idT1Grn),"), measured in Green channel, have intensity below 1 \n")
      cat("On average", round(100*sum(idT1Red)/prod(dim(idT1Red)), 3),"% of the Type I probes (",nrow(idT1Red),"), measured in Red channel, have intensity below 1 \n")
    }
  }

  ##combine all filtered results and set NA in Red and/or Green channels
  Red[idBeadmat] <- Grn[idBeadmat] <- NA

  if(zeroint) {
    if(verbose){
      cat("Set filtered probes in Red and/or Green channels to NA... \n")
    }

    for(i in 1:ncol(RGset)) {
      if(verbose & i%%100 == 0)
        cat("... done ",i," out of ",ncol(RGset)," ... \n")
      idRed <- c(names(which(idT2[,i])), names(which(idT1Red[,i])))
      midRed <- match(idRed, rownames(Red))
      Red[midRed, i] <- NA
      idGrn <- c(names(which(idT2[,i])), names(which(idT1Grn[,i])))
      midGrn <- match(idGrn, rownames(Grn))
      Grn[midGrn, i] <- NA
    }
  }

  RGChannelSet(Green = Grn, Red = Red,
               colData = colData(RGset),
               annotation = annotation(RGset))
}

tempfilteringRGset <- probeFiltering(RGset2,cutbead=3,zeroint=TRUE)
rm(RGset2)
tempbetas2 <- getBeta(preprocessRaw(tempfilteringRGset))
rm(tempfilteringRGset)


# get tally of samples for which a given probe is NA
gna <- rowSums(is.na(tempbetas2))
gna2 <- cbind.data.frame(rownames(tempbetas2),gna)
# get probe names for probes that either had zero intensity or
# less than three beads in > 10% of all samples.
zeroIntensityProbes <- gna2[gna2$gna > ncol(tempbetas2)*0.1,]$`rownames(tempbetas2)`

length(zeroIntensityProbes) # 987 zero intensity probes.

saveRDS(zeroIntensityProbes, file = here("output","zeroIntensityProbes.rds"))

# clear up some space

rm(tempbetas2)
rm(gna)
rm(gna2)

}



Description of probe attrition up to this point and set zeroIntensity probes to NA in preprocessed betas objects.

# Probe attrition starts here, with the sum
# of probes that are marked NA by initial
# masking, which set "rs" and "ch" probes to NA
# and the probes indicated for masking for quality
# reasons, and the probes indicated for masking
# for allele frequencies of SNPs in the African
# super population:

table(apply(raw_betas, 1, function (x) sum(is.na(x))))

#     0    356  Number of Participants
# 739396   127157 Number of Probes set to NA

# Are any of these non-CpG probes identified as 
# zero intensity?

checker <- apply(raw_betas, 1, function (x) sum(is.na(x)))
checker <- names(checker[checker==ncol(raw_betas)])
table(checker %in% zeroIntensityProbes)

# Yes, 106 probes are overlapping. That means
# we have 987 - 106 probes that are uniquely masked
# for zero intensity reasons, minus a single probe
# in zero intensity object that is not in the rownames
# of the raw betas object: 987 - 106 - 1 = 880.



# How many rows are all NA for probes after preprocessing but
# before adding NA for zero intensity probes?

prena <- apply(betas, 1, function (x) sum(is.na(x)))

# run table(prena) to see that 128,238 probes are NA for everyone.
# 127,157 were initially masked because they were not CpG probes,
# but some other type like "rs" or "ch" probes, or they did not
# pass the two masking filters (cg probes only and no SNP problems
# with a probe for the African super population).

# 128,238 - 127,157 = 1081 probes failed in all samples upon 
# initial preprocessing.

betas[rownames(betas) %in% zeroIntensityProbes==TRUE,] <- NA

# how many probes are NA for everyone now?

postna <- apply(betas, 1, function (x) sum(is.na(x)))
table(postna)

probes_na <- names(postna[postna==ncol(betas)])
table(zeroIntensityProbes %in% probes_na)

# 129111 probes are marked NA, which indicates
# that of the 880 zeroIntensity probes that could
# uniquely added more NA probes, 7 of them overlapped
# with the probes that were set to NA during the initial
# preprocessing for every single sample.

# Step 1 probe attrition final counts:

# initial masking of bad probes, snp probes, and non cg probes:
# 127157
# Additional masking of probes in all samples by preprocessing:
# 1081
# Additional probes set to NA because of zeroIntensity, that were
# not already marked NA by initial masking or preprocessing:
# 873.

# Total probes masked for every single person at this stage = 129111.
# x is a matrix containing the data
# method : correlation method. "pearson"" or "spearman"" is supported
# removeTriangle : remove upper or lower triangle
# results :  if "html" or "latex"
  # the results will be displayed in html or latex format
corstars <-function(x, method=c("pearson", "spearman"), removeTriangle=c("upper", "lower"),
                     result=c("none", "html", "latex")){
    #Compute correlation matrix
    require(Hmisc)
    x <- as.matrix(x)
    correlation_matrix<-rcorr(x, type=method[1])
    R <- correlation_matrix$r # Matrix of correlation coeficients
    p <- correlation_matrix$P # Matrix of p-value 
    
    ## Define notions for significance levels; spacing is important.
    mystars <- ifelse(p < .0001, "****", ifelse(p < .001, "*** ", ifelse(p < .01, "**  ", ifelse(p < .05, "*   ", "    "))))
    
    ## trunctuate the correlation matrix to two decimal
    R <- format(round(cbind(rep(-1.11, ncol(x)), R), 4))[,-1]
    
    ## build a new matrix that includes the correlations with their apropriate stars
    Rnew <- matrix(paste(R, mystars, sep=""), ncol=ncol(x))
    diag(Rnew) <- paste(diag(R), " ", sep="")
    rownames(Rnew) <- colnames(x)
    colnames(Rnew) <- paste(colnames(x), "", sep="")
    
    ## remove upper triangle of correlation matrix
    if(removeTriangle[1]=="upper"){
      Rnew <- as.matrix(Rnew)
      Rnew[upper.tri(Rnew, diag = TRUE)] <- ""
      Rnew <- as.data.frame(Rnew)
    }
    
    ## remove lower triangle of correlation matrix
    else if(removeTriangle[1]=="lower"){
      Rnew <- as.matrix(Rnew)
      Rnew[lower.tri(Rnew, diag = TRUE)] <- ""
      Rnew <- as.data.frame(Rnew)
    }
    
    ## remove last column and return the correlation matrix
    Rnew <- cbind(Rnew[1:length(Rnew)-1])
    if (result[1]=="none") return(Rnew)
    else{
      if(result[1]=="html") print(xtable(Rnew), type="html")
      else print(xtable(Rnew), type="latex") 
    }
} 

check technical replicates

correlations

raw_betas <- as.data.frame(raw_betas)



# Raw replicate correlations
raw_rep1 <- raw_betas %>% 
  select(toAverage$C1) %>% 
  corstars(method = "spearman")

raw_rep2 <- raw_betas %>% 
  select(toAverage$C12) %>% 
  corstars(method = "spearman")

raw_rep3 <- raw_betas %>% 
  select(toAverage$C20) %>% 
  corstars(method = "spearman")

raw_rep4 <- raw_betas %>% 
  select(toAverage$C29) %>% 
  corstars(method = "spearman")

raw_rep5 <- raw_betas %>% 
  select(toAverage$C49[c(1,5)]) %>% 
  corstars(method = "spearman")

raw_rep6 <- raw_betas %>% 
  select(toAverage$C49[c(2:4)]) %>% 
  corstars(method = "spearman")

raw_rep7 <- raw_betas %>% 
  select(toAverage$C5) %>% 
  corstars(method = "spearman")

raw_rep8 <- raw_betas %>% 
  select(toAverage$C62) %>% 
  corstars(method = "spearman")

raw_rep9 <- raw_betas %>% 
  select(toAverage$C89[c(1,3)]) %>% 
  corstars(method = "spearman")

raw_rep10 <- raw_betas %>% 
  select(toAverage$C89[c(2,4)]) %>% 
  corstars(method = "spearman")

raw_rep11 <- raw_betas %>% 
  select(toAverage$C91) %>% 
  corstars(method = "spearman")

raw_rep12 <- raw_betas %>% 
  select(toAverage$SV11) %>% 
  corstars(method = "spearman")

raw_rep13 <- raw_betas %>% 
  select(toAverage$SV51) %>% 
  corstars(method = "spearman")

raw_rep14 <- raw_betas %>% 
  select(toAverage$SV61) %>% 
  corstars(method = "spearman")

raw_rep15 <- raw_betas %>% 
  select(toAverage$SV67) %>% 
  corstars(method = "spearman")

raw_rep16 <- raw_betas %>%
  select(toAverage$SV7) %>%
  corstars(method = "spearman")

raw_rep17 <- raw_betas %>%
  select(toAverage$SV76) %>%
  corstars(method = "spearman")

raw_reps <- c(raw_rep1,raw_rep2,raw_rep3,raw_rep4,raw_rep5,raw_rep6,raw_rep7,
              raw_rep8,raw_rep9,raw_rep10,raw_rep11,raw_rep12,raw_rep13,raw_rep14,
              raw_rep15,raw_rep16,raw_rep17)

raw_reps
## $C1M.1.1
## [1] ""            " 0.9752****"
## 
## $C012bb.1.1.
## [1] ""            " 0.9665****"
## 
## $C20bb.1.1_250
## [1] ""            " 0.9657****"
## 
## $C29M.2.1
## [1] ""            " 0.9725****"
## 
## $C49bb.2.1
## [1] ""            " 0.9552****"
## 
## $C49M.1.2
## [1] ""            " 0.9406****" " 0.9366****"
## 
## $C49M.1.2_clean
## [1] ""            ""            " 0.9740****"
## 
## $C6.bb.2.1a
## [1] ""            " 0.9205****"
## 
## $C62M.1.1
## [1] ""            " 0.9668****"
## 
## $C89bb.2.1
## [1] ""            " 0.9548****"
## 
## $C89M2.1
## [1] ""            " 0.9697****"
## 
## $C91bb.1.2
## [1] ""            " 0.9146****"
## 
## $`SV011M.1.1 - eq100`
## [1] ""            " 0.9761****" " 0.9796****"
## 
## $`SV011M.1.1 - eq101`
## [1] ""            ""            " 0.9712****"
## 
## $`SV051M.1 - eq100`
## [1] ""            " 0.9849****" " 0.9818****"
## 
## $`SV051M.1 - eq101`
## [1] ""            ""            " 0.9826****"
## 
## $`SV061bb.2.1 - eq100`
## [1] ""            " 0.9816****" " 0.9722****"
## 
## $`SV061bb.2.1 - eq101`
## [1] ""            ""            " 0.9720****"
## 
## $`SV067bb.1.1 - quad4`
## [1] ""            " 0.9743****" " 0.9737****" " 0.9754****"
## 
## $`SV067bb.1.1 - quad2 - eq100`
## [1] ""            ""            " 0.9748****" " 0.9762****"
## 
## $SV067b.b.1.1
## [1] ""            ""            ""            " 0.9750****"
## 
## $`SV007M.1 - eq100`
## [1] ""            " 0.9812****" " 0.9785****"
## 
## $`SV007M.1.1 - eq101`
## [1] ""            ""            " 0.9802****"
## 
## $`SV076M.1.1 - dup1a`
## [1] ""            " 0.9562****"
raw_correlations <- c(0.9752, # C1M
                      0.9665, # C12bb
                      0.9657, # C20bb
                      0.9725, # C29M
                      0.9552, # C49bb
                      mean(0.9406,0.9366,0.9740), # C49M
                      0.9205, # C6bb
                      0.9668, # C62M
                      0.9548, # C89bb
                      0.9697, # C89M
                      0.9146, # C91bb
                      mean(0.9761,0.9796,0.9712), # SV11M
                      mean(0.9849,0.9818,0.9826), # SV51M
                      mean(0.9816,0.9722,0.9720), # SV61bb
                      mean(0.9743,0.9737,0.9754,0.9748,0.9762,0.9750), # SV67bb
                      mean(0.9812,0.9785,0.9802), # SV7M
                      0.9562) # SV76M


# SeSame Correlations

betas <- as.data.frame(betas)


sesame_rep1 <- betas %>% 
  select(toAverage$C1) %>% 
  corstars(method = "spearman")

sesame_rep2 <- betas %>% 
  select(toAverage$C12) %>% 
  corstars(method = "spearman")

sesame_rep3 <- betas %>% 
  select(toAverage$C20) %>% 
  corstars(method = "spearman")

sesame_rep4 <- betas %>% 
  select(toAverage$C29) %>% 
  corstars(method = "spearman")

sesame_rep5 <- betas %>% 
  select(toAverage$C49[c(1,5)]) %>% 
  corstars(method = "spearman")

sesame_rep6 <- betas %>% 
  select(toAverage$C49[c(2:4)]) %>% 
  corstars(method = "spearman")

sesame_rep7 <- betas %>% 
  select(toAverage$C5) %>% 
  corstars(method = "spearman")

sesame_rep8 <- betas %>% 
  select(toAverage$C62) %>% 
  corstars(method = "spearman")

sesame_rep9 <- betas %>% 
  select(toAverage$C89[c(1,3)]) %>% 
  corstars(method = "spearman")

sesame_rep10 <- betas %>% 
  select(toAverage$C89[c(2,4)]) %>% 
  corstars(method = "spearman")

sesame_rep11 <- betas %>% 
  select(toAverage$C91) %>% 
  corstars(method = "spearman")

sesame_rep12 <- betas %>% 
  select(toAverage$SV11) %>% 
  corstars(method = "spearman")

sesame_rep13 <- betas %>% 
  select(toAverage$SV51) %>% 
  corstars(method = "spearman")

sesame_rep14 <- betas %>% 
  select(toAverage$SV61) %>% 
  corstars(method = "spearman")

sesame_rep15 <- betas %>% 
  select(toAverage$SV67) %>% 
  corstars(method = "spearman")

sesame_rep16 <- betas %>%
  select(toAverage$SV7) %>%
  corstars(method = "spearman")

sesame_rep17 <- betas %>%
  select(toAverage$SV76) %>%
  corstars(method = "spearman")

sesame_reps <- c(sesame_rep1,sesame_rep2,sesame_rep3,sesame_rep4,sesame_rep5,sesame_rep6,sesame_rep7,
              sesame_rep8,sesame_rep9,sesame_rep10,sesame_rep11,sesame_rep12,sesame_rep13,sesame_rep14,
              sesame_rep15,sesame_rep16,sesame_rep17)

sesame_reps
## $C1M.1.1
## [1] ""            " 0.9852****"
## 
## $C012bb.1.1.
## [1] ""            " 0.9688****"
## 
## $C20bb.1.1_250
## [1] ""            " 0.9817****"
## 
## $C29M.2.1
## [1] ""            " 0.9777****"
## 
## $C49bb.2.1
## [1] ""            " 0.9756****"
## 
## $C49M.1.2
## [1] ""            " 0.9645****" " 0.9463****"
## 
## $C49M.1.2_clean
## [1] ""            ""            " 0.9766****"
## 
## $C6.bb.2.1a
## [1] ""            " 0.9197****"
## 
## $C62M.1.1
## [1] ""            " 0.9804****"
## 
## $C89bb.2.1
## [1] ""            " 0.9681****"
## 
## $C89M2.1
## [1] ""            " 0.9821****"
## 
## $C91bb.1.2
## [1] ""            " 0.9255****"
## 
## $`SV011M.1.1 - eq100`
## [1] ""            " 0.9871****" " 0.9879****"
## 
## $`SV011M.1.1 - eq101`
## [1] ""            ""            " 0.9856****"
## 
## $`SV051M.1 - eq100`
## [1] ""            " 0.9893****" " 0.9865****"
## 
## $`SV051M.1 - eq101`
## [1] ""            ""            " 0.9875****"
## 
## $`SV061bb.2.1 - eq100`
## [1] ""            " 0.9889****" " 0.9829****"
## 
## $`SV061bb.2.1 - eq101`
## [1] ""            ""            " 0.9812****"
## 
## $`SV067bb.1.1 - quad4`
## [1] ""            " 0.9851****" " 0.9845****" " 0.9809****"
## 
## $`SV067bb.1.1 - quad2 - eq100`
## [1] ""            ""            " 0.9854****" " 0.9830****"
## 
## $SV067b.b.1.1
## [1] ""            ""            ""            " 0.9829****"
## 
## $`SV007M.1 - eq100`
## [1] ""            " 0.9887****" " 0.9741****"
## 
## $`SV007M.1.1 - eq101`
## [1] ""            ""            " 0.9799****"
## 
## $`SV076M.1.1 - dup1a`
## [1] ""            " 0.9817****"
sesame_correlations <- c(0.9852, # C1M
                      0.9688, # C12bb
                      0.9817, # C20bb
                      0.9777, # C29M
                      0.9756, # C49bb
                      mean(0.9645,0.9463,0.9766), # C49M
                      0.9197, # C6bb
                      0.9804, # C62M
                      0.9681, # C89bb
                      0.9821, # C89M
                      0.9255, # C91bb
                      mean(0.9871,0.9879,0.9856), # SV11M
                      mean(0.9893,0.9865,0.9875), # SV51M
                      mean(0.9889,0.9829,0.9812), # SV61bb
                      mean(0.9851,0.9845,0.9809,0.9854,0.9830,0.9829), # SV67bb
                      mean(0.9887,0.9741,0.9799), # SV7M
                      0.9817) # SV76M



corr_names <- c("C1M","C12bb","C20bb","C29M","C49bb","C49M","C6bb","C62M",
                "C89bb","C89M","C91bb","SV11M","SV51M","SV61bb","SV67bb","SV7M","SV76M")

correlations <- cbind.data.frame(corr_names,raw_correlations,sesame_correlations)

stats <- round(apply(correlations[2:3],2,median), digits = 4)
brow <- c("Median",stats)

correlations <- rbind(correlations,brow)
colnames(correlations) <- c("Replicate","Raw","Preprocessed")


kbl(correlations,
    caption = "Technical Replicate Correlations for Raw and Preprocessed Betas") %>% 
  kable_styling("hover", full_width = F)
Technical Replicate Correlations for Raw and Preprocessed Betas
Replicate Raw Preprocessed
C1M 0.9752 0.9852
C12bb 0.9665 0.9688
C20bb 0.9657 0.9817
C29M 0.9725 0.9777
C49bb 0.9552 0.9756
C49M 0.9406 0.9645
C6bb 0.9205 0.9197
C62M 0.9668 0.9804
C89bb 0.9548 0.9681
C89M 0.9697 0.9821
C91bb 0.9146 0.9255
SV11M 0.9761 0.9871
SV51M 0.9849 0.9893
SV61bb 0.9816 0.9889
SV67bb 0.9743 0.9851
SV7M 0.9812 0.9887
SV76M 0.9562 0.9817
Median 0.9668 0.9817

pca biplots

screeplot <- PCAtools::screeplot
pca <- PCAtools::pca

rownames(df6) <- df6$methylation_id

df_reps <- df6 %>% 
  filter(replicate==TRUE)

df_reps$dyad <- as.factor(df_reps$dyad)

df_reps <- as.data.frame(df_reps)

rownames(df_reps) <- df_reps$methylation_id

raw_betas_reps <- raw_betas %>% 
  select(rav$methylation_id)

sesame_betas_reps <- betas %>% 
  select(rav$methylation_id)

identical(rownames(df_reps),colnames(raw_betas_reps))
## [1] TRUE
p <- pca(na.omit(raw_betas_reps), metadata = df_reps)
q <- pca(na.omit(sesame_betas_reps), metadata = df_reps)

screeplot(p, components = getComponents(p, 1:10))

screeplot(q, components = getComponents(p, 1:10))

biplot(p, lab = p$metadata$dyad ,colby = 'dyad',
                  hline = 0, vline = 0,
                            xlim = c(-100,100),
                  ylim = c(-100,100),
                  legendPosition = 'right',
                  title = 'Raw Betas PCA bi-plot',
                  subtitle = 'PC1 versus PC2')

biplot(q,lab = p$metadata$dyad ,colby = 'dyad',
       hline = 0, vline = 0,
                  xlim = c(-50,50),
                  ylim = c(-50,50),
                  legendPosition = 'right',
                  title = 'SeSame Betas PCA bi-plot',
                  subtitle = 'PC1 versus PC2 - note change in axis scale')

sesame_cor <- eigencorplot(q, metavars = c('Age',
                             'Female',
                             'cohort','palco','setot',
                             'gtsum','achronic','awar_nr'),
             rotLabX = 45,
             cexCorval = 0.7,
             colCorval = 'white',
             col = c('darkblue', 'blue2', 'black', 'red2', 'darkred'),
             posColKey = 'top',
             main = "PCA Correlations",
             signifSymbols = c('****', '***', '**', '*', ''),
    signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1))
sesame_cor

Samples Failing QC

# These are the samples failing the QC checks
outlier.baby
##                sample.name                      issue
## 124            C006bb.1.2.               Sex mismatch
## 34                C5bb.1.2               Sex mismatch
## 100             C6.bb.2.1a               Sex mismatch
## C91bb.1.2        C91bb.1.2 Methylated vs Unmethylated
## SV001b.b.1.2  SV001b.b.1.2 Methylated vs Unmethylated
## SV001b.b.1.21 SV001b.b.1.2          Detection p-value
## 157           SV001b.b.1.2          X-Y ratio outlier
## 1571          SV001b.b.1.2               Sex mismatch
## 145           SV014b.b.1.1               Sex mismatch
outlier.mother
##            sample.name                      issue
## C084M            C084M Methylated vs Unmethylated
## C086M            C086M Methylated vs Unmethylated
## SV018M.1.1  SV018M.1.1 Methylated vs Unmethylated
## 179        SV043M.2.1a               Sex mismatch
# These samples fail the contamination check.
# These overlap with the results above from the
# meffil QC checks.
contam
##      Sample_Name   outlier
## 260 SV001b.b.1.2 -3.463359
## 293   SV018M.1.1 -3.904566
fail_qc <- c(unique(outlier.baby$sample.name),
             outlier.mother$sample.name)
fail_qc
##  [1] "C006bb.1.2."  "C5bb.1.2"     "C6.bb.2.1a"   "C91bb.1.2"    "SV001b.b.1.2"
##  [6] "SV014b.b.1.1" "C084M"        "C086M"        "SV018M.1.1"   "SV043M.2.1a"
# That's 10 samples to drop failing qc

Remove the samples that failed qc and the probes failing in greater than 10% of all samples after removing failed samples. Then we need to drop the batches that are contributing basically no samples and preventing ComBat from running because all probes have missing data for a subset of 8 samples, for example. After ComBat, then we can remove replicates according to which batch they belong to. This works because samples failing quality control have already been removed, and therefore it doesn’t matter which replicate we keep. Letting batch membership dictate the choice is fine.


ComBat betas

Further sample attrition steps are described in this chunk.

These are betas preprocessed using the SeSame pipeline and then corrected for batch effects using ComBat.

if (file.exists(here("output","combat_betas.rds"))){
  
  combat_betas <- readRDS(here("output","combat_betas.rds"))
  
  df_combat <- df6 %>% 
  filter(methylation_id %in% colnames(combat_betas))

linker <- match(df_combat$methylation_id,colnames(combat_betas))

df_combat <- df_combat[linker,]

identical(df_combat$methylation_id,colnames(combat_betas))


} else {

# First remove those samples failing quality control:

betas_clean <- betas[!(colnames(betas) %in% fail_qc)]

# That takes us from 356 samples to 346 samples.


# Next, remove those samples that belong to small
# batches contributing very few samples. This prevents
# ComBat from working because we have all missing data
# for a number of these batches on many probes.

get_batch_drop <- df6 %>% 
  filter(plate == "A" | plate == "B" | plate == "F") %>% 
  select(methylation_id)

betas_clean_2 <- betas_clean[!(colnames(betas_clean) %in% get_batch_drop$methylation_id)]

# Removing the small batches results in a loss of 23 samples, taking us from
# 346 to 323

# Remove all probes that failed in more than 10% of samples:

c_betas <- betas_clean_2[rowSums(is.na(betas_clean_2)) < ncol(betas_clean_2)*0.10,]

# That takes us from 866553 to 706987.

dim(c_betas)

df_combat <- df6 %>% 
  filter(methylation_id %in% colnames(c_betas))

linker <- match(df_combat$methylation_id,colnames(c_betas))

df_combat <- df_combat[linker,]

identical(df_combat$methylation_id,colnames(c_betas))


# Run ComBat:

mvals <- as.matrix(BetaValueToMValue(c_betas))

# make model matrix
modcombat <- model.matrix(~1, data = df_combat)

# Run ComBat with a parametric Bayesian framework
cvals <- ComBat(
dat = mvals,
batch = df_combat$plate,
mod = modcombat,
par.prior = TRUE, # runs a parametric framework
prior.plots = FALSE, # outputs prior plots
mean.only = FALSE,
ref.batch = NULL)

combat_betas <- as.data.frame(MValueToBetaValue(cvals))
rm(cvals)
rm(mvals)
rm(betas_clean)
rm(betas_clean_2)

saveRDS(combat_betas, file = here("output","combat_betas.rds"))

}
## [1] TRUE

Check the performance of the replicates we have. It won’t be as many as with the raw and sesame checks because we have dropped failed samples and three small batches of samples.

check technical replicates

# correlation


# combat replicate correlations

combat_rep4 <- combat_betas %>% 
  select(toAverage$C29) %>% 
  corstars(method = "spearman")

combat_rep8 <- combat_betas %>% 
  select(toAverage$C62) %>% 
  corstars(method = "spearman")

combat_rep15 <- combat_betas %>% 
  select(toAverage$SV67) %>% 
  corstars(method = "spearman")

combat_rep17 <- combat_betas %>%
  select(toAverage$SV76) %>%
  corstars(method = "spearman")

combat_reps <- c(combat_rep4,combat_rep8,combat_rep15,combat_rep17)

combat_reps
## $C29M.2.1
## [1] ""           " 0.978****"
## 
## $C62M.1.1
## [1] ""            " 0.9852****"
## 
## $`SV067bb.1.1 - quad4`
## [1] ""            " 0.9851****" " 0.9844****" " 0.9835****"
## 
## $`SV067bb.1.1 - quad2 - eq100`
## [1] ""            ""            " 0.9854****" " 0.9840****"
## 
## $SV067b.b.1.1
## [1] ""            ""            ""            " 0.9835****"
## 
## $`SV076M.1.1 - dup1a`
## [1] ""            " 0.9788****"
combat_correlations <- c(0.9780, # C29M
                      0.9852, # C62M
                      mean(0.9851,0.9844,0.9835,0.9854,0.9840,0.9835), # SV67bb
                      0.9788) # SV76M

pca biplots

# PCA biplots

rownames(df_combat) <- df_combat$methylation_id

df_reps <- df_combat %>% 
  mutate(id = paste0(dyad,tissue)) %>% 
  filter(id %in% c("C29mother_venous_blood",
                     "C62mother_venous_blood",
                     "SV67baby_venous_blood",
                     "SV76mother_venous_blood"))

df_reps$dyad <- as.factor(df_reps$dyad)

df_reps <- as.data.frame(df_reps)

rownames(df_reps) <- df_reps$methylation_id

combat_betas_reps <- combat_betas %>% 
  select(df_reps$methylation_id)


identical(rownames(df_reps),colnames(combat_betas_reps))
## [1] TRUE
r <- pca(na.omit(combat_betas_reps), metadata = df_reps)


screeplot(r, components = getComponents(p, 1:10))

biplot(r, lab = r$metadata$dyad ,colby = 'dyad',
                  hline = 0, vline = 0,
                            xlim = c(-50,50),
                  ylim = c(-50,50),
                  legendPosition = 'right',
                  title = 'ComBat Betas PCA bi-plot',
                  subtitle = 'PC1 versus PC2')

Drop replicates & duplicate samples

We can do this by contamination score for technical replicates.

keep_reps <- df_combat %>% 
  mutate(id = paste0(dyad,tissue)) %>% 
  group_by(id) %>% 
  filter(n()>1) %>% 
  arrange(desc(outlier)) %>% # "outlier" is the column with contamination score
  summarise_all(last)

# keep_reps has the 5 I want to keep...but I actually need
# the ones I want to remove...

remove_reps <- df_combat %>% 
  mutate(id = paste0(dyad,tissue)) %>% 
  filter(id %in% keep_reps$id) %>% 
  filter(methylation_id %in% keep_reps$methylation_id==FALSE) %>% 
  select(methylation_id,outlier)

length(remove_reps$methylation_id) # 7 samples to remove prior to EWAS.
## [1] 7
# final combat betas
# Remove replicates
fbetas <- combat_betas[!(colnames(combat_betas) %in% remove_reps$methylation_id)]

# That takes us from 323 samples to 316 samples.

######################################################
# Remove siblings and samples designated for dropping
# HERE
#####################################################

fbetas <- fbetas[!(colnames(fbetas) %in% droppers)]

# That takes us from 316 samples to 302 samples.

dim(fbetas)
## [1] 706987    302
df7 <- df_combat %>% 
  filter(methylation_id %in% colnames(fbetas)) %>% 
  mutate(bmi = mwgt/((mhgt/100)^2)) %>% 
  dplyr::rename(parous = is_this_your_first_child)

Split datasets

# Get mothers dataset
dfm <- df7 %>% 
  filter(tissue == "mother_venous_blood")

mom_betas <- fbetas[colnames(fbetas) %in% dfm$methylation_id]

# Add in cell type proportions for mothers, and first 4-5 PCs
# of cell type

mom_cells <- read.csv(here("output","congo_mother_cell_composition_20220613.csv"))
rownames(mom_cells) <- mom_cells$IID

mom_cells <- mom_cells[-c(1:2)]

mom_pr_cells <- prcomp(as.matrix(mom_cells))

var_explained_mom = mom_pr_cells$sdev^2 / sum(mom_pr_cells$sdev^2)
cat("First PC explains",round(var_explained_mom[1]*100,digits = 2),
    "percent of the variance in cell type.")
## First PC explains 89.22 percent of the variance in cell type.
qplot(c(1:6),var_explained_mom) + 
  geom_line() + 
  xlab("Principal Component") + 
  ylab("Variance Explained") +
  ggtitle("Scree Plot for Mothers Cell Type") +
  theme_light() +
  ylim(0, 1)

mom_pr_cells <- as.data.frame(mom_pr_cells$x) %>% 
  rownames_to_column(var = "methylation_id") %>% 
  rename(PC1_cells = PC1,
         PC2_cells = PC2,
         PC3_cells = PC3,
         PC4_cells = PC4,
         PC5_cells = PC5,
         PC6_cells = PC6)

dfm <- mom_cells %>% 
  rownames_to_column(var = "methylation_id") %>% 
  left_join(mom_pr_cells, by = c("methylation_id" = "methylation_id")) %>% 
  right_join(dfm, by = c("methylation_id" = "methylation_id"))

# Add in top 10 methylation PCs for all mothers

mom_meth_pcs <- prcomp(as.matrix(t(na.omit(mom_betas))))
mom_meth_pcs <- mom_meth_pcs$x[,1:10]

dfm <- as.data.frame(mom_meth_pcs) %>% 
  rownames_to_column(var = "methylation_id") %>% 
  right_join(dfm, by = c("methylation_id" = "methylation_id"))

# Order the metadata so it's the same as the column names order
mlink <- match(dfm$methylation_id, colnames(mom_betas))

dfm <- dfm[mlink,]

dfm <- as.data.frame(dfm)

rownames(dfm) <- dfm$methylation_id



identical(rownames(dfm),colnames(mom_betas)) # TRUE
## [1] TRUE
#####################
# Get babies dataset
####################

dfb <- df7 %>% 
  filter(tissue == "baby_venous_blood")

baby_betas <- fbetas[colnames(fbetas) %in% dfb$methylation_id]

# Add in cell type proportions for babies, and first 4-5 PCs
# of cell type


baby_cells <- read.csv(here("output","congo_baby_cell_composition_20220613.csv"))
rownames(baby_cells) <- baby_cells$IID

baby_cells <- baby_cells[-c(1:2)]

baby_pr_cells <- prcomp(as.matrix(baby_cells))

var_explained_baby = baby_pr_cells$sdev^2 / sum(baby_pr_cells$sdev^2)
cat("First PC explains",round(var_explained_baby[1]*100,digits = 2),
    "percent of the variance in cell type.")
## First PC explains 62.18 percent of the variance in cell type.
cat("First two PCs explain",
    round(var_explained_baby[1]*100,digits = 2) + 
      round(var_explained_baby[2]*100,digits = 2),
    "percent of the variance in cell type.")
## First two PCs explain 90.55 percent of the variance in cell type.
qplot(c(1:7),var_explained_baby) + 
  geom_line() + 
  xlab("Principal Component") + 
  ylab("Variance Explained") +
  ggtitle("Scree Plot for Babies Cell Type") +
  theme_light() +
  ylim(0, 1)

baby_pr_cells <- as.data.frame(baby_pr_cells$x) %>% 
  rownames_to_column(var = "methylation_id") %>% 
  rename(PC1_cells = PC1,
         PC2_cells = PC2,
         PC3_cells = PC3,
         PC4_cells = PC4,
         PC5_cells = PC5,
         PC6_cells = PC6,
         PC7_cells = PC7)

dfb <- baby_cells %>% 
  rownames_to_column(var = "methylation_id") %>% 
  left_join(baby_pr_cells, by = c("methylation_id" = "methylation_id")) %>% 
  right_join(dfb, by = c("methylation_id" = "methylation_id"))

# Add in top 10 methylation PCs for all babies
baby_meth_pcs <- prcomp(as.matrix(t(na.omit(baby_betas))))
baby_meth_pcs <- baby_meth_pcs$x[,1:10]

dfb <- as.data.frame(baby_meth_pcs) %>% 
  rownames_to_column(var = "methylation_id") %>% 
  right_join(dfb, by = c("methylation_id" = "methylation_id"))



# Order the metadata so it's the same as the column names order
blink <- match(dfb$methylation_id, colnames(baby_betas))

dfb <- dfb[blink,]

dfb <- as.data.frame(dfb)

rownames(dfb) <- dfb$methylation_id

identical(rownames(dfb),colnames(baby_betas)) # TRUE
## [1] TRUE


This leaves us with 151 mothers and 151 babies.



EDA

 # Use this package to easily
# distinguish between the two cohorts in pairs plots.

# Mothers

ppcols1 <- c("gtsum","setot","achronic","awar_nr",
             "bmi","Age","sex","pcsec","parous","ga_meth")
ppcols2 <- c("gtsum","setot","achronic","awar_nr","Neu",
             "NK","Bcell","CD4T","CD8T","Mono")


# Pairs plot of some demographics and cell type
pm1 <- ggpairs(dfm, columns = ppcols1, ggplot2::aes(color=cohort)) +
  theme_bw()

pm2 <- ggpairs(dfm, columns = ppcols2, ggplot2::aes(color=cohort)) +
  theme_bw()

pm1

pm2

# Correlation Matrix of methylation PCs, covariates, predictors, and cell types

cordat <- dfm %>% 
  select(PC1:PC10,gtsum,setot,achronic,awar_nr,
         bmi,Age,ga_meth,Neu,NK,
         Bcell,CD4T,CD8T,Mono,bwgt)

cormat <- corstars(as.matrix(cordat))

kbl(cormat, digits = 2,
    caption = "Pearson Correlations for Predictors, Covariates, and Cell Types") %>% 
  kable_styling("hover")
Pearson Correlations for Predictors, Covariates, and Cell Types
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 gtsum setot achronic awar_nr bmi Age ga_meth Neu NK Bcell CD4T CD8T Mono
PC1
PC2 0.0000
PC3 0.0000 0.0000
PC4 0.0000 0.0000 0.0000
PC5 0.0000 0.0000 0.0000 0.0000
PC6 0.0000 0.0000 0.0000 0.0000 0.0000
PC7 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
PC8 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
PC9 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
PC10 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
gtsum -0.0282 0.0791 -0.0355 -0.0462 -0.0523 -0.1814* 0.1798* -0.0383 0.0234 -0.1142
setot 0.1254 -0.0377 -0.0882 0.0497 -0.0593 -0.0222 0.0312 0.1030 0.0018 0.0320 0.3974****
achronic 0.0684 -0.0592 -0.1687* 0.0189 -0.0707 -0.0252 0.0586 0.2042* 0.0260 0.0055 0.2953*** 0.4372****
awar_nr 0.0207 0.0621 0.0578 -0.0683 0.0297 -0.0906 0.0838 0.0640 0.1381 -0.1085 0.5886**** 0.3659**** 0.2398**
bmi -0.0077 0.0452 0.1195 -0.0677 -0.0721 -0.0066 -0.1299 -0.2683*** -0.1075 -0.0085 -0.0048 -0.1311 -0.2953*** -0.0773
Age -0.0057 0.0654 -0.0059 -0.0255 0.1680* 0.0392 -0.0303 -0.3140**** 0.0303 -0.0140 -0.0118 -0.3045*** -0.4504**** 0.0039 0.3845****
ga_meth -0.0594 0.0351 0.1420 0.0725 0.0224 -0.0301 0.0118 -0.0872 -0.0100 -0.0417 0.0328 0.0269 -0.1167 -0.0284 0.1783* 0.2520**
Neu 0.9339**** -0.0756 0.1033 0.1012 0.0423 -0.0544 0.0289 0.0266 0.0445 -0.0676 0.0079 0.1406 0.0610 0.0567 -0.0398 -0.0227 -0.0410
NK -0.7527**** -0.0547 -0.0559 -0.0589 0.2326** -0.0297 -0.0961 0.0282 0.1675* 0.0768 -0.0252 -0.1172 -0.0832 -0.0475 -0.0527 -0.0065 -0.0398 -0.7456****
Bcell -0.6540**** 0.3027*** -0.2324** -0.2193** 0.0725 -0.1111 -0.0798 -0.0592 -0.0876 0.1306 -0.0086 -0.0617 0.0601 -0.0222 -0.0006 -0.0612 0.0184 -0.7384**** 0.4985****
CD4T -0.7989**** 0.1696* -0.1042 -0.1311 -0.2312** 0.1313 0.0045 -0.0605 -0.1273 -0.0036 -0.0121 -0.1587 -0.0657 -0.0426 0.1074 0.0530 0.0574 -0.8534**** 0.4594**** 0.6195****
CD8T -0.8630**** -0.0513 -0.0199 -0.0331 0.1135 0.1228 -0.0207 0.0601 -0.0149 -0.0015 -0.0369 -0.1013 -0.0922 -0.0772 -0.0604 0.0022 0.0213 -0.8317**** 0.6857**** 0.5051**** 0.5671****
Mono -0.3329**** -0.0180 -0.0621 -0.0062 -0.0804 -0.2048* 0.0196 -0.0998 -0.0060 0.1421 0.0924 -0.0222 0.0340 0.0220 0.0942 0.0371 0.0689 -0.5305**** 0.3487**** 0.3309**** 0.2696*** 0.2828***
bwgt -0.0519 0.0275 -0.0116 0.0651 -0.0506 -0.0783 -0.1097 -0.2318** 0.1098 -0.0360 -0.0477 -0.0196 -0.2889*** -0.1879* 0.3984**** 0.4469**** 0.4500**** -0.0665 0.0569 -0.0017 0.0382 0.0673 0.0895
# Heatmap

d <- cor(cordat, use = "pairwise.complete.obs")
d <- round(d, 1)



p.mat <- cor(d[,-1])


##############
# HEATMAP
##############

cor.mtest <- function(mat, ...) {
    mat <- as.matrix(mat)
    n <- ncol(mat)
    p.mat<- matrix(NA, n, n)
    diag(p.mat) <- 0
    for (i in 1:(n - 1)) {
        for (j in (i + 1):n) {
            tmp <- cor.test(mat[, i], mat[, j], ...)
            p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
        }
    }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}
# matrix of the p-value of the correlation
p.mat <- cor.mtest(cordat)

corrplot::corrplot(d, 
     method="color", 
     #col=col(200),  
     type="upper", 
     #order="hclust", 
     addCoef.col = "black", 
     tl.col="black",
     number.cex = 0.7,
     tl.cex = 0.6,
     tl.srt=45,
     p.mat =p.mat,
     sig.level = 0.5,
     insig = "label_sig")


EWAS

Use robust regression for this. With babies, control for infant sex, maternal BMI, alcohol, parity, maternal age, cohort, cell type PCs 1 and 2, and then add the maternal stress predictor.

mom_betas <- t(mom_betas)

identical(rownames(mom_betas),rownames(dfm))
## [1] TRUE
baby_betas <- t(baby_betas)

identical(rownames(baby_betas),rownames(dfb))
## [1] TRUE




mod_sum <- function(models){
  broom::tidy(lmtest::coeftest(models, vcov. = sandwich), conf.int = TRUE)
}

mod_sum_robust <- function (models) {
 
 dat <- tryCatch(
        {
         broom::tidy(lmtest::coeftest(models, vcov. = sandwich), conf.int = TRUE)      
        },
         error = function(cond) {
                 return(NA)
        },
         warning = function(cond) {
                  return(NULL)
                  }
   )
  return(dat)   
}


getValues <- function(x) {
  coef <- sapply(x, function (x) x[[length(x[[1]]),"estimate"]])
  std_error <- sapply(x, function (x) x[[length(x[[1]]),"std.error"]])
  test_stat <- sapply(x, function (x) x[[length(x[[1]]),"statistic"]])
  pval <- sapply(x, function (x) x[[length(x[[1]]),"p.value"]])
  conf_low <- sapply(x, function (x) x[[length(x[[1]]),"conf.low"]])
  conf_high <- sapply(x, function (x) x[[length(x[[1]]),"conf.high"]])
  df <- cbind.data.frame(coef,std_error,test_stat,pval,conf_low,conf_high)
  return(df)
}



get_Values_Direct_Robust <- function(x) {
  
  df <- tryCatch(
    
        {
  coef <- sapply(x, function (x) x[[length(x[[1]]),"estimate"]])
  std_error <- sapply(x, function (x) x[[length(x[[1]]),"std.error"]])
  test_stat <- sapply(x, function (x) x[[length(x[[1]]),"statistic"]])
  pval <- sapply(x, function (x) x[[length(x[[1]]),"p.value"]])
  conf_low <- sapply(x, function (x) x[[length(x[[1]]),"conf.low"]])
  conf_high <- sapply(x, function (x) x[[length(x[[1]]),"conf.high"]])
  df <- cbind.data.frame(coef,std_error,test_stat,pval,conf_low,conf_high)
          
        },
  
         error = function(cond) {
                 return(NA)
        },
  
         warning = function(cond) {
                  return(NULL)
           
                  }
   )
  
  return(df)   
}

mothers

gtsum

1:250K

if (file.exists(here("output","mom_gtsum_params1_cell_corrected.rds"))){
  mom_gtsum_params1_cell_corrected <- readRDS(here("output","mom_gtsum_params1_cell_corrected.rds"))

} else {

identical(rownames(dfm),rownames(mom_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfm")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_mom_gtsum_1 <- parApply(cl,mom_betas[,1:250000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ Age +
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      cohort +
                      PC1_cells +
                      gtsum,
                      data = dfm)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)


cl <- makeCluster(16)
registerDoParallel(cl)
clusterExport(cl, c("tidy","mod_sum_robust","coeftest","sandwich"))

mom_gtsum_results1_cell_corrected <- parLapply(cl,regs_mom_gtsum_1, mod_sum_robust)

stopCluster(cl)

rm(regs_mom_gtsum_1)

mom_gtsum_results1_cell_corrected <- Filter(Negate(anyNA), mom_gtsum_results1_cell_corrected)

list.save(mom_gtsum_results1_cell_corrected, file = here("output","mom_gtsum_results1_cell_corrected.rds"))

mom_gtsum_params1_cell_corrected <- getValues(mom_gtsum_results1_cell_corrected)

saveRDS(mom_gtsum_params1_cell_corrected, here("output","mom_gtsum_params1_cell_corrected.rds"))

}

250K:500K

if (file.exists(here("output","mom_gtsum_params2_cell_corrected.rds"))){
  mom_gtsum_params2_cell_corrected <- readRDS(here("output","mom_gtsum_params2_cell_corrected.rds"))

} else {

identical(rownames(dfm),rownames(mom_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfm")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_mom_gtsum_2 <- parApply(cl,mom_betas[,250001:500000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ Age +
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      cohort +
                      PC1_cells +
                      gtsum,
                      data = dfm)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)


cl <- makeCluster(16)
registerDoParallel(cl)
clusterExport(cl, c("tidy","mod_sum_robust","coeftest","sandwich"))

mom_gtsum_results2_cell_corrected <- parLapply(cl,regs_mom_gtsum_2, mod_sum_robust)

stopCluster(cl)

rm(regs_mom_gtsum_2)

mom_gtsum_results2_cell_corrected <- Filter(Negate(anyNA), mom_gtsum_results2_cell_corrected)

list.save(mom_gtsum_results2_cell_corrected, file = here("output","mom_gtsum_results2_cell_corrected.rds"))

mom_gtsum_params2_cell_corrected <- getValues(mom_gtsum_results2_cell_corrected)

saveRDS(mom_gtsum_params2_cell_corrected, here("output","mom_gtsum_params2_cell_corrected.rds"))

}

500K:ncol

if (file.exists(here("output","mom_gtsum_params3_cell_corrected.rds"))){
  mom_gtsum_params3_cell_corrected <- readRDS(here("output","mom_gtsum_params3_cell_corrected.rds"))

} else {

identical(rownames(dfm),rownames(mom_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfm")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_mom_gtsum_3 <- parApply(cl,mom_betas[,500001:ncol(mom_betas)],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ Age +
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      cohort +
                      PC1_cells +
                      gtsum,
                      data = dfm)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)


cl <- makeCluster(16)
registerDoParallel(cl)
clusterExport(cl, c("tidy","mod_sum_robust","coeftest","sandwich"))

mom_gtsum_results3_cell_corrected <- parLapply(cl,regs_mom_gtsum_3, mod_sum_robust)

stopCluster(cl)

rm(regs_mom_gtsum_3)

mom_gtsum_results3_cell_corrected <- Filter(Negate(anyNA), mom_gtsum_results3_cell_corrected)

list.save(mom_gtsum_results3_cell_corrected, file = here("output","mom_gtsum_results3_cell_corrected.rds"))

mom_gtsum_params3_cell_corrected <- getValues(mom_gtsum_results3_cell_corrected)

saveRDS(mom_gtsum_params3_cell_corrected, here("output","mom_gtsum_params3_cell_corrected.rds"))

}

mothers

setot

1:250K

if (file.exists(here("output","mom_setot_params1_cell_corrected.rds"))){
  mom_setot_params1_cell_corrected <- readRDS(here("output","mom_setot_params1_cell_corrected.rds"))

} else {

identical(rownames(dfm),rownames(mom_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfm")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_mom_setot_1 <- parApply(cl,mom_betas[,1:250000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ Age +
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      cohort +
                      PC1_cells +
                      setot,
                      data = dfm)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)


cl <- makeCluster(16)
registerDoParallel(cl)
clusterExport(cl, c("tidy","mod_sum_robust","coeftest","sandwich"))

mom_setot_results1_cell_corrected <- parLapply(cl,regs_mom_setot_1, mod_sum_robust)

stopCluster(cl)

rm(regs_mom_setot_1)

mom_setot_results1_cell_corrected <- Filter(Negate(anyNA), mom_setot_results1_cell_corrected)

list.save(mom_setot_results1_cell_corrected, file = here("output","mom_setot_results1_cell_corrected.rds"))

mom_setot_params1_cell_corrected <- getValues(mom_setot_results1_cell_corrected)

saveRDS(mom_setot_params1_cell_corrected, here("output","mom_setot_params1_cell_corrected.rds"))

}
if (file.exists(here("output","mom_setot_params2_cell_corrected.rds"))){
  mom_setot_params2_cell_corrected <- readRDS(here("output","mom_setot_params2_cell_corrected.rds"))

} else {

identical(rownames(dfm),rownames(mom_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfm")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_mom_setot_2 <- parApply(cl,mom_betas[,250001:500000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ Age +
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      cohort +
                      PC1_cells +
                      setot,
                      data = dfm)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)



cl <- makeCluster(16)
registerDoParallel(cl)
clusterExport(cl, c("tidy","mod_sum_robust","coeftest","sandwich"))

mom_setot_results2_cell_corrected <- parLapply(cl,regs_mom_setot_2, mod_sum_robust)

stopCluster(cl)

rm(regs_mom_setot_2)

mom_setot_results2_cell_corrected <- Filter(Negate(anyNA), mom_setot_results2_cell_corrected)

list.save(mom_setot_results2_cell_corrected, file = here("output","mom_setot_results2_cell_corrected.rds"))

mom_setot_params2_cell_corrected <- getValues(mom_setot_results2_cell_corrected)

saveRDS(mom_setot_params2_cell_corrected, here("output","mom_setot_params2_cell_corrected.rds"))

}
if (file.exists(here("output","mom_setot_params3_cell_corrected.rds"))){
  mom_setot_params3_cell_corrected <- readRDS(here("output","mom_setot_params3_cell_corrected.rds"))

} else {

identical(rownames(dfm),rownames(mom_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfm")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_mom_setot_3 <- parApply(cl,mom_betas[,500001:ncol(mom_betas)],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ Age +
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      cohort +
                      PC1_cells +
                      setot,
                      data = dfm)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)



cl <- makeCluster(16)
registerDoParallel(cl)
clusterExport(cl, c("tidy","mod_sum_robust","coeftest","sandwich"))

mom_setot_results3_cell_corrected <- parLapply(cl,regs_mom_setot_3, mod_sum_robust)

stopCluster(cl)

rm(regs_mom_setot_3)

mom_setot_results3_cell_corrected <- Filter(Negate(anyNA), mom_setot_results3_cell_corrected)

list.save(mom_setot_results3_cell_corrected, file = here("output","mom_setot_results3_cell_corrected.rds"))

mom_setot_params3_cell_corrected <- getValues(mom_setot_results3_cell_corrected)

saveRDS(mom_setot_params3_cell_corrected, here("output","mom_setot_params3_cell_corrected.rds"))

}

mothers

achronic

1:250K

if (file.exists(here("output","mom_achronic_params1_cell_corrected.rds"))){
  mom_achronic_params1_cell_corrected <- readRDS(here("output","mom_achronic_params1_cell_corrected.rds"))

} else {

identical(rownames(dfm),rownames(mom_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfm")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_mom_achronic_1 <- parApply(cl,mom_betas[,1:250000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ Age +
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      cohort +
                      PC1_cells +
                      achronic,
                      data = dfm)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)



cl <- makeCluster(16)
registerDoParallel(cl)
clusterExport(cl, c("tidy","mod_sum_robust","coeftest","sandwich"))

mom_achronic_results1_cell_corrected <- parLapply(cl,regs_mom_achronic_1, mod_sum_robust)

stopCluster(cl)

rm(regs_mom_achronic_1)

mom_achronic_results1_cell_corrected <- Filter(Negate(anyNA), mom_achronic_results1_cell_corrected)

list.save(mom_achronic_results1_cell_corrected, file = here("output","mom_achronic_results1_cell_corrected.rds"))

mom_achronic_params1_cell_corrected <- getValues(mom_achronic_results1_cell_corrected)

saveRDS(mom_achronic_params1_cell_corrected, here("output","mom_achronic_params1_cell_corrected.rds"))

}
if (file.exists(here("output","mom_achronic_params2_cell_corrected.rds"))){
  mom_achronic_params2_cell_corrected <- readRDS(here("output","mom_achronic_params2_cell_corrected.rds"))

} else {

identical(rownames(dfm),rownames(mom_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfm")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_mom_achronic_2 <- parApply(cl,mom_betas[,250001:500000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ Age +
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      cohort +
                      PC1_cells +
                      achronic,
                      data = dfm)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)



cl <- makeCluster(16)
registerDoParallel(cl)
clusterExport(cl, c("tidy","mod_sum_robust","coeftest","sandwich"))

mom_achronic_results2_cell_corrected <- parLapply(cl,regs_mom_achronic_2, mod_sum_robust)

stopCluster(cl)

rm(regs_mom_achronic_2)

mom_achronic_results2_cell_corrected <- Filter(Negate(anyNA), mom_achronic_results2_cell_corrected)

list.save(mom_achronic_results2_cell_corrected, file = here("output","mom_achronic_results2_cell_corrected.rds"))

mom_achronic_params2_cell_corrected <- getValues(mom_achronic_results2_cell_corrected)

saveRDS(mom_achronic_params2_cell_corrected, here("output","mom_achronic_params2_cell_corrected.rds"))

}
if (file.exists(here("output","mom_achronic_params3_cell_corrected.rds"))){
  mom_achronic_params3_cell_corrected <- readRDS(here("output","mom_achronic_params3_cell_corrected.rds"))

} else {

identical(rownames(dfm),rownames(mom_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfm")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_mom_achronic_3 <- parApply(cl,mom_betas[,500001:ncol(mom_betas)],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ Age +
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      cohort +
                      PC1_cells +
                      achronic,
                      data = dfm)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.

cl <- makeCluster(16)
registerDoParallel(cl)
clusterExport(cl, c("tidy","mod_sum_robust","coeftest","sandwich"))

mom_achronic_results3_cell_corrected <- parLapply(cl,regs_mom_achronic_3, mod_sum_robust)

stopCluster(cl)

rm(regs_mom_achronic_3)

mom_achronic_results3_cell_corrected <- Filter(Negate(anyNA), mom_achronic_results3_cell_corrected)

list.save(mom_achronic_results3_cell_corrected, file = here("output","mom_achronic_results3_cell_corrected.rds"))

mom_achronic_params3_cell_corrected <- getValues(mom_achronic_results3_cell_corrected)

saveRDS(mom_achronic_params3_cell_corrected, here("output","mom_achronic_params3_cell_corrected.rds"))

}

mothers

awar_nr

1:250K

if (file.exists(here("output","mom_awar_nr_params1_cell_corrected.rds"))){
  mom_awar_nr_params1_cell_corrected <- readRDS(here("output","mom_awar_nr_params1_cell_corrected.rds"))

} else {

identical(rownames(dfm),rownames(mom_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfm")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_mom_awar_nr_1 <- parApply(cl,mom_betas[,1:250000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ Age +
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      cohort +
                      PC1_cells +
                      awar_nr,
                      data = dfm)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.

cl <- makeCluster(16)
registerDoParallel(cl)
clusterExport(cl, c("tidy","mod_sum_robust","coeftest","sandwich"))

mom_awar_nr_results1_cell_corrected <- parLapply(cl,regs_mom_awar_nr_1, mod_sum_robust)

stopCluster(cl)

rm(regs_mom_awar_nr_1)

mom_awar_nr_results1_cell_corrected <- Filter(Negate(anyNA), mom_awar_nr_results1_cell_corrected)

list.save(mom_awar_nr_results1_cell_corrected, file = here("output","mom_awar_nr_results1_cell_corrected.rds"))

mom_awar_nr_params1_cell_corrected <- getValues(mom_awar_nr_results1_cell_corrected)

saveRDS(mom_awar_nr_params1_cell_corrected, here("output","mom_awar_nr_params1_cell_corrected.rds"))

}
if (file.exists(here("output","mom_awar_nr_params2_cell_corrected.rds"))){
  mom_awar_nr_params2_cell_corrected <- readRDS(here("output","mom_awar_nr_params2_cell_corrected.rds"))

} else {

identical(rownames(dfm),rownames(mom_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfm")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_mom_awar_nr_2 <- parApply(cl,mom_betas[,250001:500000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ Age +
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      cohort +
                      PC1_cells +
                      awar_nr,
                      data = dfm)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.

cl <- makeCluster(16)
registerDoParallel(cl)
clusterExport(cl, c("tidy","mod_sum_robust","coeftest","sandwich"))

mom_awar_nr_results2_cell_corrected <- parLapply(cl,regs_mom_awar_nr_2, mod_sum_robust)

stopCluster(cl)

rm(regs_mom_awar_nr_2)

mom_awar_nr_results2_cell_corrected <- Filter(Negate(anyNA), mom_awar_nr_results2_cell_corrected)

list.save(mom_awar_nr_results2_cell_corrected, file = here("output","mom_awar_nr_results2_cell_corrected.rds"))

mom_awar_nr_params2_cell_corrected <- getValues(mom_awar_nr_results2_cell_corrected)

saveRDS(mom_awar_nr_params2_cell_corrected, here("output","mom_awar_nr_params2_cell_corrected.rds"))

}
if (file.exists(here("output","mom_awar_nr_params3_cell_corrected.rds"))){
  mom_awar_nr_params3_cell_corrected <- readRDS(here("output","mom_awar_nr_params3_cell_corrected.rds"))

} else {

identical(rownames(dfm),rownames(mom_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfm")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_mom_awar_nr_3 <- parApply(cl,mom_betas[,500001:ncol(mom_betas)],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ Age +
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      cohort +
                      PC1_cells +
                      awar_nr,
                      data = dfm)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.

cl <- makeCluster(16)
registerDoParallel(cl)
clusterExport(cl, c("tidy","mod_sum_robust","coeftest","sandwich"))

mom_awar_nr_results3_cell_corrected <- parLapply(cl,regs_mom_awar_nr_3, mod_sum_robust)

stopCluster(cl)

rm(regs_mom_awar_nr_3)

mom_awar_nr_results3_cell_corrected <- Filter(Negate(anyNA), mom_awar_nr_results3_cell_corrected)

list.save(mom_awar_nr_results3_cell_corrected, file = here("output","mom_awar_nr_results3_cell_corrected.rds"))

mom_awar_nr_params3_cell_corrected <- getValues(mom_awar_nr_results3_cell_corrected)

saveRDS(mom_awar_nr_params3_cell_corrected, here("output","mom_awar_nr_params3_cell_corrected.rds"))

}

babies

gtsum

1:250K

if (file.exists(here("output","baby_gtsum_params1_cell_corrected.rds"))){
  baby_gtsum_params1_cell_corrected <- readRDS(here("output","baby_gtsum_params1_cell_corrected.rds"))
  
} else {

identical(rownames(dfb),rownames(baby_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfb")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_baby_gtsum_1 <- parApply(cl,baby_betas[,1:250000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ age + # lower case age is mother's age
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      sex +
                      ga_meth +
                      cohort +
                      PC1_cells +
                      PC2_cells +
                      gtsum,
                      data = dfb)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.

baby_gtsum_results1_cell_corrected <- lapply(regs_baby_gtsum_1, mod_sum_robust)

rm(regs_baby_gtsum_1)

baby_gtsum_results1_cell_corrected <- Filter(Negate(anyNA), baby_gtsum_results1_cell_corrected)

list.save(baby_gtsum_results1_cell_corrected, file = here("output","baby_gtsum_results1_cell_corrected.rds"))

baby_gtsum_params1_cell_corrected <- getValues(baby_gtsum_results1_cell_corrected)

saveRDS(baby_gtsum_params1_cell_corrected, here("output","baby_gtsum_params1_cell_corrected.rds"))

}

gtsum

250k:500k

if (file.exists(here("output","baby_gtsum_params2_cell_corrected.rds"))){
  baby_gtsum_params2_cell_corrected <- readRDS(here("output","baby_gtsum_params2_cell_corrected.rds"))
  
} else {

identical(rownames(dfb),rownames(baby_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfb")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_baby_gtsum_2 <- parApply(cl,baby_betas[,250001:500000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ age + # lower case age is mother's age
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      sex +
                      ga_meth +
                      cohort +
                      PC1_cells +
                      PC2_cells +
                      gtsum,
                      data = dfb)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.

baby_gtsum_results2_cell_corrected <- lapply(regs_baby_gtsum_2, mod_sum_robust)

rm(regs_baby_gtsum_2)

baby_gtsum_results2_cell_corrected <- Filter(Negate(anyNA), baby_gtsum_results2_cell_corrected)

list.save(baby_gtsum_results2_cell_corrected, file = here("output","baby_gtsum_results2_cell_corrected.rds"))

baby_gtsum_params2_cell_corrected <- getValues(baby_gtsum_results2_cell_corrected)

saveRDS(baby_gtsum_params2_cell_corrected, here("output","baby_gtsum_params2_cell_corrected.rds"))

}

gtsum

500k:ncol

if (file.exists(here("output","baby_gtsum_params3_cell_corrected.rds"))){
  baby_gtsum_params3_cell_corrected <- readRDS(here("output","baby_gtsum_params3_cell_corrected.rds"))
  
} else {

identical(rownames(dfb),rownames(baby_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfb")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_baby_gtsum_3 <- parApply(cl,baby_betas[,500001:ncol(baby_betas)],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ age + # lower case age is mother's age
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      sex +
                      ga_meth +
                      cohort +
                      PC1_cells +
                      PC2_cells +
                      gtsum,
                      data = dfb)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.

baby_gtsum_results3_cell_corrected <- lapply(regs_baby_gtsum_3, mod_sum_robust)


rm(regs_baby_gtsum_3)

baby_gtsum_results3_cell_corrected <- Filter(Negate(anyNA), baby_gtsum_results3_cell_corrected)

list.save(baby_gtsum_results3_cell_corrected, file = here("output","baby_gtsum_results3_cell_corrected.rds"))

baby_gtsum_params3_cell_corrected <- getValues(baby_gtsum_results3_cell_corrected)

saveRDS(baby_gtsum_params3_cell_corrected, here("output","baby_gtsum_params3_cell_corrected.rds"))

}

setot

1:250K

if (file.exists(here("output","baby_setot_params1_cell_corrected.rds"))){
  baby_setot_params1_cell_corrected <- readRDS(here("output","baby_setot_params1_cell_corrected.rds"))
  
} else {

identical(rownames(dfb),rownames(baby_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfb")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_baby_setot_1 <- parApply(cl,baby_betas[,1:250000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ age + # lower case age is mother's age
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      sex +
                      ga_meth +
                      cohort +
                      PC1_cells +
                      PC2_cells +
                      setot,
                      data = dfb)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.


baby_setot_results1_cell_corrected <- lapply(regs_baby_setot_1, mod_sum_robust)


rm(regs_baby_setot_1)

baby_setot_results1_cell_corrected <- Filter(Negate(anyNA), baby_setot_results1_cell_corrected)

list.save(baby_setot_results1_cell_corrected, file = here("output","baby_setot_results1_cell_corrected.rds"))

baby_setot_params1_cell_corrected <- getValues(baby_setot_results1_cell_corrected)

saveRDS(baby_setot_params1_cell_corrected, here("output","baby_setot_params1_cell_corrected.rds"))

}

setot

250k:500k

if (file.exists(here("output","baby_setot_params2_cell_corrected.rds"))){
  baby_setot_params2_cell_corrected <- readRDS(here("output","baby_setot_params2_cell_corrected.rds"))
  
} else {

identical(rownames(dfb),rownames(baby_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfb")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_baby_setot_2 <- parApply(cl,baby_betas[,250001:500000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ age + # lower case age is mother's age
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      sex +
                      ga_meth +
                      cohort +
                      PC1_cells +
                      PC2_cells +
                      setot,
                      data = dfb)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.

baby_setot_results2_cell_corrected <- lapply(regs_baby_setot_2, mod_sum_robust)


rm(regs_baby_setot_2)

baby_setot_results2_cell_corrected <- Filter(Negate(anyNA), baby_setot_results2_cell_corrected)

list.save(baby_setot_results2_cell_corrected, file = here("output","baby_setot_results2_cell_corrected.rds"))

baby_setot_params2_cell_corrected <- getValues(baby_setot_results2_cell_corrected)

saveRDS(baby_setot_params2_cell_corrected, here("output","baby_setot_params2_cell_corrected.rds"))

}

setot

500k:ncol

if (file.exists(here("output","baby_setot_params3_cell_corrected.rds"))){
  baby_setot_params3_cell_corrected <- readRDS(here("output","baby_setot_params3_cell_corrected.rds"))
  
} else {

identical(rownames(dfb),rownames(baby_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfb")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_baby_setot_3 <- parApply(cl,baby_betas[,500001:ncol(baby_betas)],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ age + # lower case age is mother's age
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      sex +
                      ga_meth +
                      cohort +
                      PC1_cells +
                      PC2_cells +
                      setot,
                      data = dfb)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.

baby_setot_results3_cell_corrected <- lapply(regs_baby_setot_3, mod_sum_robust)


rm(regs_baby_setot_3)

baby_setot_results3_cell_corrected <- Filter(Negate(anyNA), baby_setot_results3_cell_corrected)

list.save(baby_setot_results3_cell_corrected, file = here("output","baby_setot_results3_cell_corrected.rds"))

baby_setot_params3_cell_corrected <- getValues(baby_setot_results3_cell_corrected)

saveRDS(baby_setot_params3_cell_corrected, here("output","baby_setot_params3_cell_corrected.rds"))

}

achronic

1:250K

if (file.exists(here("output","baby_achronic_params1_cell_corrected.rds"))){
  baby_achronic_params1_cell_corrected <- readRDS(here("output","baby_achronic_params1_cell_corrected.rds"))
  
} else {

identical(rownames(dfb),rownames(baby_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfb")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_baby_achronic_1 <- parApply(cl,baby_betas[,1:250000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ age + # lower case age is mother's age
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      sex +
                      ga_meth +
                      cohort +
                      PC1_cells +
                      PC2_cells +
                      achronic,
                      data = dfb)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.


baby_achronic_results1_cell_corrected <- lapply(regs_baby_achronic_1, mod_sum_robust)


rm(regs_baby_achronic_1)

baby_achronic_results1_cell_corrected <- Filter(Negate(anyNA), baby_achronic_results1_cell_corrected)

list.save(baby_achronic_results1_cell_corrected, file = here("output","baby_achronic_results1_cell_corrected.rds"))

baby_achronic_params1_cell_corrected <- getValues(baby_achronic_results1_cell_corrected)

saveRDS(baby_achronic_params1_cell_corrected, here("output","baby_achronic_params1_cell_corrected.rds"))

}

achronic

250k:500k

if (file.exists(here("output","baby_achronic_params2_cell_corrected.rds"))){
  baby_achronic_params2_cell_corrected <- readRDS(here("output","baby_achronic_params2_cell_corrected.rds"))
  
} else {

identical(rownames(dfb),rownames(baby_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfb")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_baby_achronic_2 <- parApply(cl,baby_betas[,250001:500000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ age + # lower case age is mother's age
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      sex +
                      ga_meth +
                      cohort +
                      PC1_cells +
                      PC2_cells +
                      achronic,
                      data = dfb)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.

baby_achronic_results2_cell_corrected <- lapply(regs_baby_achronic_2, mod_sum_robust)


rm(regs_baby_achronic_2)

baby_achronic_results2_cell_corrected <- Filter(Negate(anyNA), baby_achronic_results2_cell_corrected)

list.save(baby_achronic_results2_cell_corrected, file = here("output","baby_achronic_results2_cell_corrected.rds"))

baby_achronic_params2_cell_corrected <- getValues(baby_achronic_results2_cell_corrected)

saveRDS(baby_achronic_params2_cell_corrected, here("output","baby_achronic_params2_cell_corrected.rds"))

}

achronic

500k:ncol

if (file.exists(here("output","baby_achronic_params3_cell_corrected.rds"))){
  baby_achronic_params3_cell_corrected <- readRDS(here("output","baby_achronic_params3_cell_corrected.rds"))
  
} else {

identical(rownames(dfb),rownames(baby_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfb")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_baby_achronic_3 <- parApply(cl,baby_betas[,500001:ncol(baby_betas)],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ age + # lower case age is mother's age
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      sex +
                      ga_meth +
                      cohort +
                      PC1_cells +
                      PC2_cells +
                      achronic,
                      data = dfb)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.

baby_achronic_results3_cell_corrected <- lapply(regs_baby_achronic_3, mod_sum_robust)


rm(regs_baby_achronic_3)

baby_achronic_results3_cell_corrected <- Filter(Negate(anyNA), baby_achronic_results3_cell_corrected)

list.save(baby_achronic_results3_cell_corrected, file = here("output","baby_achronic_results3_cell_corrected.rds"))

baby_achronic_params3_cell_corrected <- getValues(baby_achronic_results3_cell_corrected)

saveRDS(baby_achronic_params3_cell_corrected, here("output","baby_achronic_params3_cell_corrected.rds"))

}

awar_nr

1:250K

if (file.exists(here("output","baby_awar_nr_params1_cell_corrected.rds"))){
  baby_awar_nr_params1_cell_corrected <- readRDS(here("output","baby_awar_nr_params1_cell_corrected.rds"))
  
} else {

identical(rownames(dfb),rownames(baby_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfb")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_baby_awar_nr_1 <- parApply(cl,baby_betas[,1:250000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ age + # lower case age is mother's age
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      sex +
                      ga_meth +
                      cohort +
                      PC1_cells +
                      PC2_cells +
                      awar_nr,
                      data = dfb)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.


baby_awar_nr_results1_cell_corrected <- lapply(regs_baby_awar_nr_1, mod_sum_robust)


rm(regs_baby_awar_nr_1)

baby_awar_nr_results1_cell_corrected <- Filter(Negate(anyNA), baby_awar_nr_results1_cell_corrected)

list.save(baby_awar_nr_results1_cell_corrected, file = here("output","baby_awar_nr_results1_cell_corrected.rds"))

baby_awar_nr_params1_cell_corrected <- getValues(baby_awar_nr_results1_cell_corrected)

saveRDS(baby_awar_nr_params1_cell_corrected, here("output","baby_awar_nr_params1_cell_corrected.rds"))

}

awar_nr

250k:500k

if (file.exists(here("output","baby_awar_nr_params2_cell_corrected.rds"))){
  baby_awar_nr_params2_cell_corrected <- readRDS(here("output","baby_awar_nr_params2_cell_corrected.rds"))
  
} else {

identical(rownames(dfb),rownames(baby_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfb")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# # This takes 6 minutes with 92 GB RAM.

system.time(
regs_baby_awar_nr_2 <- parApply(cl,baby_betas[,250001:500000],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ age + # lower case age is mother's age
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      sex +
                      ga_meth +
                      cohort +
                      PC1_cells +
                      PC2_cells +
                      awar_nr,
                      data = dfb)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.

baby_awar_nr_results2_cell_corrected <- lapply(regs_baby_awar_nr_2, mod_sum_robust)


rm(regs_baby_awar_nr_2)

baby_awar_nr_results2_cell_corrected <- Filter(Negate(anyNA), baby_awar_nr_results2_cell_corrected)

list.save(baby_awar_nr_results2_cell_corrected, file = here("output","baby_awar_nr_results2_cell_corrected.rds"))

baby_awar_nr_params2_cell_corrected <- getValues(baby_awar_nr_results2_cell_corrected)

saveRDS(baby_awar_nr_params2_cell_corrected, here("output","baby_awar_nr_params2_cell_corrected.rds"))

}

awar_nr

500k:ncol

if (file.exists(here("output","baby_awar_nr_params3_cell_corrected.rds"))){
  baby_awar_nr_params3_cell_corrected <- readRDS(here("output","baby_awar_nr_params3_cell_corrected.rds"))
  
} else {

identical(rownames(dfb),rownames(baby_betas))

# make socket and register
cl <- makeCluster(16)

registerDoParallel(cl)

# export phenotype data only to each worker

clusterExport(cl,"dfb")
clusterExport(cl, c("rlm"))

# Create an error message we can search for easily
error_message <- function() 'fail'

# Export the error_message function to all workers
clusterExport(cl,"error_message")

# This takes 6 minutes with 92 GB RAM.

system.time(
regs_baby_awar_nr_3 <- parApply(cl,baby_betas[,500001:ncol(baby_betas)],2, function(x)

  tryCatch(

  {

    all_models <- rlm(x ~ age + # lower case age is mother's age
                      bmi +
                      parous +
                      pcsec +
                      palco +
                      sex +
                      ga_meth +
                      cohort +
                      PC1_cells +
                      PC2_cells +
                      awar_nr,
                      data = dfb)

    return(all_models)

    },

  error = function(e){

        error_message()

    })))

stopCluster(cl)

# This takes 4 minutes with 92 GB RAM.

baby_awar_nr_results3_cell_corrected <- lapply(regs_baby_awar_nr_3, mod_sum_robust)


rm(regs_baby_awar_nr_3)

baby_awar_nr_results3_cell_corrected <- Filter(Negate(anyNA), baby_awar_nr_results3_cell_corrected)

list.save(baby_awar_nr_results3_cell_corrected, file = here("output","baby_awar_nr_results3_cell_corrected.rds"))

baby_awar_nr_params3_cell_corrected <- getValues(baby_awar_nr_results3_cell_corrected)

saveRDS(baby_awar_nr_params3_cell_corrected, here("output","baby_awar_nr_params3_cell_corrected.rds"))

}

Results

mom_gtsum_results <- mom_gtsum_params1_cell_corrected %>% 
  bind_rows(mom_gtsum_params2_cell_corrected) %>% 
  bind_rows(mom_gtsum_params3_cell_corrected) %>% 
  mutate(p_log10 = -1*log10(pval)) %>% 
  mutate(fdr = p.adjust(pval, method = "fdr")) %>% 
  rownames_to_column(var = "probe")

mom_setot_results <- mom_setot_params1_cell_corrected %>% 
  bind_rows(mom_setot_params2_cell_corrected) %>% 
  bind_rows(mom_setot_params3_cell_corrected) %>% 
  mutate(p_log10 = -1*log10(pval)) %>% 
  mutate(fdr = p.adjust(pval, method = "fdr"))%>% 
  rownames_to_column(var = "probe")

mom_achronic_results <- mom_achronic_params1_cell_corrected %>% 
  bind_rows(mom_achronic_params2_cell_corrected) %>% 
  bind_rows(mom_achronic_params3_cell_corrected) %>% 
  mutate(p_log10 = -1*log10(pval)) %>% 
  mutate(fdr = p.adjust(pval, method = "fdr"))%>% 
  rownames_to_column(var = "probe")

mom_awar_nr_results <- mom_awar_nr_params1_cell_corrected %>% 
  bind_rows(mom_awar_nr_params2_cell_corrected) %>% 
  bind_rows(mom_awar_nr_params3_cell_corrected) %>% 
  mutate(p_log10 = -1*log10(pval)) %>% 
  mutate(fdr = p.adjust(pval, method = "fdr"))%>% 
  rownames_to_column(var = "probe")


# head(sort(mom_gtsum_results$pval), n=10)
# head(sort(mom_setot_results$pval), n=10)
# head(sort(mom_achronic_results$pval), n=10)
# head(sort(mom_awar_nr_results$pval), n=10)





baby_gtsum_results <- baby_gtsum_params1_cell_corrected %>% 
  bind_rows(baby_gtsum_params2_cell_corrected) %>% 
  bind_rows(baby_gtsum_params3_cell_corrected) %>% 
  mutate(p_log10 = -1*log10(pval)) %>% 
  mutate(fdr = p.adjust(pval, method = "fdr"))%>% 
  rownames_to_column(var = "probe")

baby_setot_results <- baby_setot_params1_cell_corrected %>% 
  bind_rows(baby_setot_params2_cell_corrected) %>% 
  bind_rows(baby_setot_params3_cell_corrected) %>% 
  mutate(p_log10 = -1*log10(pval)) %>% 
  mutate(fdr = p.adjust(pval, method = "fdr"))%>% 
  rownames_to_column(var = "probe")

baby_achronic_results <- baby_achronic_params1_cell_corrected %>% 
  bind_rows(baby_achronic_params2_cell_corrected) %>% 
  bind_rows(baby_achronic_params3_cell_corrected) %>% 
  mutate(p_log10 = -1*log10(pval)) %>% 
  mutate(fdr = p.adjust(pval, method = "fdr"))%>% 
  rownames_to_column(var = "probe")

baby_awar_nr_results <- baby_awar_nr_params1_cell_corrected %>% 
  bind_rows(baby_awar_nr_params2_cell_corrected) %>% 
  bind_rows(baby_awar_nr_params3_cell_corrected) %>% 
  mutate(p_log10 = -1*log10(pval)) %>% 
  mutate(fdr = p.adjust(pval, method = "fdr"))%>% 
  rownames_to_column(var = "probe")


# head(sort(baby_gtsum_results$pval), n=10)
# head(sort(baby_setot_results$pval), n=10)
# head(sort(baby_achronic_results$pval), n=10)
# head(sort(baby_awar_nr_results$pval), n=10)



Manhattan

# make Manhattan plots for all eight analyses.

# gtsum mothers. 

# One model failed so there are 706986 probes here.

mom_gtsum_man <- zhou %>% 
  select(probeID,CpG_chrm,CpG_end,probeType) %>% 
  rename(probe = probeID, CHR = CpG_chrm, BP = CpG_end) %>%
  right_join(mom_gtsum_results, by = c("probe" = "probe")) %>% 
  filter(probeType == "cg") %>% 
  mutate(chrom = str_remove(CHR,"chr")) %>% 
  arrange(chrom) %>% 
  filter(chrom != "Y") %>% # This removes 6 probes. 706980 total.
  mutate(chrom2 = factor(chrom,
                         ordered = TRUE,
                         levels = c("1","2","3","4","5","6","7",
                                    "8","9","10","11","12","13",
                                    "14","15","16","17","18","19",
                                    "20","21","22","X")))

mom_gtsum_man2 <- mom_gtsum_man %>%   
  # Compute chromosome size
  group_by(chrom2) %>% 
  summarise(chr_len=max(BP)) %>% 
  
  # Calculate cumulative position of each chromosome
  mutate(tot= cumsum(as.numeric(chr_len))-as.numeric(chr_len)) %>%
  select(-chr_len) %>%
  
  # Add this info to the initial dataset
  left_join(mom_gtsum_man,., by= c("chrom2" = "chrom2")) %>%
  
  # Add a cumulative position of each SNP
  arrange(chrom2, BP) %>%
  mutate(BPcum=BP+tot)

axisdf <- mom_gtsum_man2 %>% group_by(chrom2) %>% dplyr::summarize(center= (max(BPcum) + min(BPcum) ) / 2 )



mom_gtsum_man <- ggplot(mom_gtsum_man2, aes(x=BPcum, y=-log10(pval))) +
    
    # Show all points
    geom_point(aes(color=chrom2), alpha=0.7, size=1.1) +
    geom_hline(yintercept = -log10(0.05/nrow(mom_gtsum_man2)), color = "black",
               lty = "dashed") +
  
   # geom_label_repel(aes(x= BPcum, y= p_log10, label = probe), 
   #           data = mom_gtsum_man2[mom_gtsum_man2$p_log10 > 
   #                                      -log10(0.05/nrow(mom_gtsum_man2)), ],
   #           box.padding = 0.6) +
  
    scale_color_manual(values = rep(c("purple", "green3"), 23 )) +
    
    # custom X axis:
   scale_x_continuous( label = axisdf$chrom2, breaks= axisdf$center) +
   xlab("Chromosome") +
    # remove space between plot area and x axis
   scale_y_continuous(expand = c(0, 0), labels = seq(0,12,2), breaks = seq(0,12,2),
                      limits = c(0,12)) + 
   ylab("-log10 p value") +
  
    # Custom the theme:
    theme_bw() +
    theme( 
      legend.position="none",
      panel.border = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank()
    ) +
    ggtitle("General trauma in mothers") +
  theme(plot.title = element_text(hjust = 0.5)) + 
  guides(x = guide_axis(angle = 90)) +
  theme(axis.text.x = element_text(size = 6))



# setot mothers

# one model failed here. 706986 probes.

mom_setot_man <- zhou %>% 
  select(probeID,CpG_chrm,CpG_end,probeType) %>%
  rename(probe = probeID, CHR = CpG_chrm, BP = CpG_end) %>%
  right_join(mom_setot_results, by = c("probe" = "probe")) %>% 
  filter(probeType=="cg") %>% 
  mutate(chrom = str_remove(CHR,"chr")) %>% 
  arrange(chrom) %>% 
  filter(chrom != "Y") %>% # This removes 6 y chromosome probes. 706980 probes left.
  mutate(chrom2 = factor(chrom,
                         ordered = TRUE,
                         levels = c("1","2","3","4","5","6","7",
                                    "8","9","10","11","12","13",
                                    "14","15","16","17","18","19",
                                    "20","21","22","X")))

mom_setot_man2 <- mom_setot_man %>%   
  # Compute chromosome size
  group_by(chrom2) %>% 
  summarise(chr_len=max(BP)) %>% 
  
  # Calculate cumulative position of each chromosome
  mutate(tot= cumsum(as.numeric(chr_len))-as.numeric(chr_len)) %>%
  select(-chr_len) %>%
  
  # Add this info to the initial dataset
  left_join(mom_setot_man,., by= c("chrom2" = "chrom2")) %>%
  
  # Add a cumulative position of each SNP
  arrange(chrom2, BP) %>%
  mutate(BPcum=BP+tot)

axisdf <- mom_setot_man2 %>% group_by(chrom2) %>% dplyr::summarize(center= (max(BPcum) + min(BPcum) ) / 2 )


mom_setot_man <- ggplot(mom_setot_man2, aes(x=BPcum, y=-log10(pval))) +
    
    # Show all points
    geom_point(aes(color=chrom2), alpha=0.7, size=1.1) +
    geom_hline(yintercept = -log10(0.05/nrow(mom_setot_man2)), color = "black",
               lty = "dashed") +
  
   # geom_label_repel(aes(x= BPcum, y= p_log10, label = probe), 
   #           data = mom_setot_man2[mom_setot_man2$p_log10 > 
   #                                      -log10(0.05/nrow(mom_setot_man2)), ],
   #           box.padding = 0.6, max.overlaps = Inf) +
  
    scale_color_manual(values = rep(c("purple", "green3"), 23 )) +
    
    # custom X axis:
   scale_x_continuous( label = axisdf$chrom2, breaks= axisdf$center) +
   xlab("Chromosome") +
    # remove space between plot area and x axis
   scale_y_continuous(expand = c(0, 0), labels = seq(0,12,2), breaks = seq(0,12,2),
                      limits = c(0,12)) + 
   ylab("-log10 p value") +
  
    # Custom the theme:
    theme_bw() +
    theme( 
      legend.position="none",
      panel.border = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank()
    ) +
    ggtitle("Sexual trauma in mothers") +
  theme(plot.title = element_text(hjust = 0.5)) + 
  guides(x = guide_axis(angle = 90)) +
  theme(axis.text.x = element_text(size = 6))







# awar_nr mothers

# Total probes is 706987.

mom_awar_nr_man <- zhou %>% 
  select(probeID,CpG_chrm,CpG_end,probeType) %>%
  rename(probe = probeID, CHR = CpG_chrm, BP = CpG_end) %>%
  right_join(mom_awar_nr_results, by = c("probe" = "probe")) %>% 
  filter(probeType=="cg") %>% 
  mutate(chrom = str_remove(CHR,"chr")) %>% 
  arrange(chrom) %>% 
  filter(chrom != "Y") %>% # This removes six more probes. 706981.
  mutate(chrom2 = factor(chrom,
                         ordered = TRUE,
                         levels = c("1","2","3","4","5","6","7",
                                    "8","9","10","11","12","13",
                                    "14","15","16","17","18","19",
                                    "20","21","22","X")))

mom_awar_nr_man2 <- mom_awar_nr_man %>%   
  # Compute chromosome size
  group_by(chrom2) %>% 
  summarise(chr_len=max(BP)) %>% 
  
  # Calculate cumulative position of each chromosome
  mutate(tot= cumsum(as.numeric(chr_len))-as.numeric(chr_len)) %>%
  select(-chr_len) %>%
  
  # Add this info to the initial dataset
  left_join(mom_awar_nr_man,., by= c("chrom2" = "chrom2")) %>%
  
  # Add a cumulative position of each SNP
  arrange(chrom2, BP) %>%
  mutate(BPcum=BP+tot)

axisdf <- mom_awar_nr_man2 %>% group_by(chrom2) %>% dplyr::summarize(center= (max(BPcum) + min(BPcum) ) / 2 )


mom_awar_nr_man <- ggplot(mom_awar_nr_man2, aes(x=BPcum, y=-log10(pval))) +
    
    # Show all points
    geom_point(aes(color=chrom2), alpha=0.7, size=1.1) +
    geom_hline(yintercept = -log10(0.05/nrow(mom_awar_nr_man2)), color = "black",
               lty = "dashed") +
  
   # geom_label_repel(aes(x= BPcum, y= p_log10, label = probe), 
   #           data = mom_awar_nr_man2[mom_awar_nr_man2$p_log10 > 
   #                                      -log10(0.05/nrow(mom_awar_nr_man2)), ],
   #           box.padding = 0.6) +
  
    scale_color_manual(values = rep(c("purple", "green3"), 23 )) +
    
    # custom X axis:
   scale_x_continuous( label = axisdf$chrom2, breaks= axisdf$center) +
   xlab("Chromosome") +
    # remove space between plot area and x axis
   scale_y_continuous(expand = c(0, 0), labels = seq(0,12,2), breaks = seq(0,12,2),
                      limits = c(0,12)) + 
   ylab("-log10 p value") +
  
    # Custom the theme:
    theme_bw() +
    theme( 
      legend.position="none",
      panel.border = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank()
    ) +
    ggtitle("War trauma in mothers") +
  theme(plot.title = element_text(hjust = 0.5)) + 
  guides(x = guide_axis(angle = 90)) +
  theme(axis.text.x = element_text(size = 6))





# achronic mothers

# one probe failed. 706986 probes total.

mom_achronic_man <- zhou %>% 
  select(probeID,CpG_chrm,CpG_end,probeType) %>%
  rename(probe = probeID, CHR = CpG_chrm, BP = CpG_end) %>%
  right_join(mom_achronic_results, by = c("probe" = "probe")) %>% 
  filter(probeType=="cg") %>% 
  mutate(chrom = str_remove(CHR,"chr")) %>% 
  arrange(chrom) %>% 
  filter(chrom != "Y") %>% # This removes 6 probes. 706980 probes total.
  mutate(chrom2 = factor(chrom,
                         ordered = TRUE,
                         levels = c("1","2","3","4","5","6","7",
                                    "8","9","10","11","12","13",
                                    "14","15","16","17","18","19",
                                    "20","21","22","X")))

mom_achronic_man2 <- mom_achronic_man %>%   
  # Compute chromosome size
  group_by(chrom2) %>% 
  summarise(chr_len=max(BP)) %>% 
  
  # Calculate cumulative position of each chromosome
  mutate(tot= cumsum(as.numeric(chr_len))-as.numeric(chr_len)) %>%
  select(-chr_len) %>%
  
  # Add this info to the initial dataset
  left_join(mom_achronic_man,., by= c("chrom2" = "chrom2")) %>%
  
  # Add a cumulative position of each SNP
  arrange(chrom2, BP) %>%
  mutate(BPcum=BP+tot)

axisdf <- mom_achronic_man2 %>% group_by(chrom2) %>% dplyr::summarize(center= (max(BPcum) + min(BPcum) ) / 2 )


mom_achronic_man <- ggplot(mom_achronic_man2, aes(x=BPcum, y=-log10(pval))) +
    
    # Show all points
    geom_point(aes(color=chrom2), alpha=0.7, size=1.1) +
    geom_hline(yintercept = -log10(0.05/nrow(mom_achronic_man2)), color = "black",
               lty = "dashed") +
  
   # geom_label_repel(aes(x= BPcum, y= p_log10, label = probe), 
   #           data = mom_achronic_man2[mom_achronic_man2$p_log10 > 
   #                                      -log10(0.05/nrow(mom_achronic_man2)), ],
   #           box.padding = 0.6) +
  
    scale_color_manual(values = rep(c("purple", "green3"), 23 )) +
    
    # custom X axis:
   scale_x_continuous( label = axisdf$chrom2, breaks= axisdf$center) +
   xlab("Chromosome") +
    # remove space between plot area and x axis
   scale_y_continuous(expand = c(0, 0), labels = seq(0,12,2), breaks = seq(0,12,2),
                      limits = c(0,12)) + 
   ylab("-log10 p value") +
  
    # Custom the theme:
    theme_bw() +
    theme( 
      legend.position="none",
      panel.border = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank()
    ) +
    ggtitle("Chronic stress in mothers") +
  theme(plot.title = element_text(hjust = 0.5)) + 
  guides(x = guide_axis(angle = 90)) +
  theme(axis.text.x = element_text(size = 6))


##########################
# Baby Manhattan Plots
#########################


# gtsum babies

# Total probes are 706987.

baby_gtsum_man <- zhou %>% 
  select(probeID,CpG_chrm,CpG_end,probeType) %>% 
  rename(probe = probeID, CHR = CpG_chrm, BP = CpG_end) %>%
  right_join(baby_gtsum_results, by = c("probe" = "probe")) %>% 
  filter(probeType == "cg") %>% 
  mutate(chrom = str_remove(CHR,"chr")) %>% 
  arrange(chrom) %>% 
  filter(chrom != "Y") %>% # This removes 6 probes
  filter(chrom != "X") %>% # This removes 15114 probes. Total left is 691867.
  mutate(chrom2 = factor(chrom,
                         ordered = TRUE,
                         levels = c("1","2","3","4","5","6","7",
                                    "8","9","10","11","12","13",
                                    "14","15","16","17","18","19",
                                    "20","21","22")))

baby_gtsum_man2 <- baby_gtsum_man %>%   
  # Compute chromosome size
  group_by(chrom2) %>% 
  summarise(chr_len=max(BP)) %>% 
  
  # Calculate cumulative position of each chromosome
  mutate(tot= cumsum(as.numeric(chr_len))-as.numeric(chr_len)) %>%
  select(-chr_len) %>%
  
  # Add this info to the initial dataset
  left_join(baby_gtsum_man,., by= c("chrom2" = "chrom2")) %>%
  
  # Add a cumulative position of each SNP
  arrange(chrom2, BP) %>%
  mutate(BPcum=BP+tot)

axisdf <- baby_gtsum_man2 %>% group_by(chrom2) %>% dplyr::summarize(center= (max(BPcum) + min(BPcum) ) / 2 )



baby_gtsum_man <- ggplot(baby_gtsum_man2, aes(x=BPcum, y=-log10(pval))) +
    
    # Show all points
    geom_point(aes(color=chrom2), alpha=0.7, size=1.1) +
    geom_hline(yintercept = -log10(0.05/nrow(baby_gtsum_man2)), color = "black",
               lty = "dashed") +
  
   # geom_label_repel(aes(x= BPcum, y= p_log10, label = probe), 
   #           data = baby_gtsum_man2[baby_gtsum_man2$p_log10 > 
   #                                      -log10(0.05/nrow(baby_gtsum_man2)), ],
   #           box.padding = 0.6) +
  
    scale_color_manual(values = rep(c("purple", "green3"), 22 )) +
    
    # custom X axis:
   scale_x_continuous( label = axisdf$chrom2, breaks= axisdf$center) +
   xlab("Chromosome") +
    # remove space between plot area and x axis
   scale_y_continuous(expand = c(0, 0), labels = seq(0,12,2), breaks = seq(0,12,2),
                      limits = c(0,12)) + 
   ylab("-log10 p value") +
  
    # Custom the theme:
    theme_bw() +
    theme( 
      legend.position="none",
      panel.border = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank()
    ) +
    ggtitle("General trauma in newborns") +
  theme(plot.title = element_text(hjust = 0.5)) + 
  guides(x = guide_axis(angle = 90)) +
  theme(axis.text.x = element_text(size = 6))







# setot babies

baby_setot_man <- zhou %>% 
  select(probeID,CpG_chrm,CpG_end,probeType) %>%
  rename(probe = probeID, CHR = CpG_chrm, BP = CpG_end) %>%
  right_join(baby_setot_results, by = c("probe" = "probe")) %>% 
  filter(probeType=="cg") %>% 
  mutate(chrom = str_remove(CHR,"chr")) %>% 
  arrange(chrom) %>% 
  filter(chrom != "Y") %>% # 6 probes removed
  filter(chrom != "X") %>% # 15114 probes removed. 691867 probes total left.
  mutate(chrom2 = factor(chrom,
                         ordered = TRUE,
                         levels = c("1","2","3","4","5","6","7",
                                    "8","9","10","11","12","13",
                                    "14","15","16","17","18","19",
                                    "20","21","22")))

baby_setot_man2 <- baby_setot_man %>%   
  # Compute chromosome size
  group_by(chrom2) %>% 
  summarise(chr_len=max(BP)) %>% 
  
  # Calculate cumulative position of each chromosome
  mutate(tot= cumsum(as.numeric(chr_len))-as.numeric(chr_len)) %>%
  select(-chr_len) %>%
  
  # Add this info to the initial dataset
  left_join(baby_setot_man,., by= c("chrom2" = "chrom2")) %>%
  
  # Add a cumulative position of each SNP
  arrange(chrom2, BP) %>%
  mutate(BPcum=BP+tot)

axisdf <- baby_setot_man2 %>% group_by(chrom2) %>% dplyr::summarize(center= (max(BPcum) + min(BPcum) ) / 2 )


baby_setot_man <- ggplot(baby_setot_man2, aes(x=BPcum, y=-log10(pval))) +
    
    # Show all points
    geom_point(aes(color=chrom2), alpha=0.7, size=1.1) +
    geom_hline(yintercept = -log10(0.05/nrow(baby_setot_man2)), color = "black",
               lty = "dashed") +
  
   # geom_label_repel(aes(x= BPcum, y= p_log10, label = probe), 
   #           data = baby_setot_man2[baby_setot_man2$p_log10 > 
   #                                      -log10(0.05/nrow(baby_setot_man2)), ],
   #           box.padding = 0.6, max.overlaps = Inf) +
  
    scale_color_manual(values = rep(c("purple", "green3"), 22 )) +
    
    # custom X axis:
   scale_x_continuous( label = axisdf$chrom2, breaks= axisdf$center) +
   xlab("Chromosome") +
    # remove space between plot area and x axis
   scale_y_continuous(expand = c(0, 0), labels = seq(0,12,2), breaks = seq(0,12,2),
                      limits = c(0,12)) + 
   ylab("-log10 p value") +
  
    # Custom the theme:
    theme_bw() +
    theme( 
      legend.position="none",
      panel.border = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank()
    ) +
    ggtitle("Sexual trauma in newborns") +
  theme(plot.title = element_text(hjust = 0.5)) + 
  guides(x = guide_axis(angle = 90)) +
  theme(axis.text.x = element_text(size = 6))







# awar_nr babies

# 706987 probes total.

baby_awar_nr_man <- zhou %>% 
  select(probeID,CpG_chrm,CpG_end,probeType) %>%
  rename(probe = probeID, CHR = CpG_chrm, BP = CpG_end) %>%
  right_join(baby_awar_nr_results, by = c("probe" = "probe")) %>% 
  filter(probeType=="cg") %>% 
  mutate(chrom = str_remove(CHR,"chr")) %>% 
  arrange(chrom) %>% 
  filter(chrom != "Y") %>% # 6 probes removed
  filter(chrom != "X") %>% # 15114 probes removed. Leaves 691867 probes.
  mutate(chrom2 = factor(chrom,
                         ordered = TRUE,
                         levels = c("1","2","3","4","5","6","7",
                                    "8","9","10","11","12","13",
                                    "14","15","16","17","18","19",
                                    "20","21","22")))

baby_awar_nr_man2 <- baby_awar_nr_man %>%   
  # Compute chromosome size
  group_by(chrom2) %>% 
  summarise(chr_len=max(BP)) %>% 
  
  # Calculate cumulative position of each chromosome
  mutate(tot= cumsum(as.numeric(chr_len))-as.numeric(chr_len)) %>%
  select(-chr_len) %>%
  
  # Add this info to the initial dataset
  left_join(baby_awar_nr_man,., by= c("chrom2" = "chrom2")) %>%
  
  # Add a cumulative position of each SNP
  arrange(chrom2, BP) %>%
  mutate(BPcum=BP+tot)

axisdf <- baby_awar_nr_man2 %>% group_by(chrom2) %>% dplyr::summarize(center= (max(BPcum) + min(BPcum) ) / 2 )


baby_awar_nr_man <- ggplot(baby_awar_nr_man2, aes(x=BPcum, y=-log10(pval))) +
    
    # Show all points
    geom_point(aes(color=chrom2), alpha=0.7, size=1.1) +
    geom_hline(yintercept = -log10(0.05/nrow(baby_awar_nr_man2)), color = "black",
               lty = "dashed") +
  
   # geom_label_repel(aes(x= BPcum, y= p_log10, label = probe), 
   #           data = baby_awar_nr_man2[baby_awar_nr_man2$p_log10 > 
   #                                      -log10(0.05/nrow(baby_awar_nr_man2)), ],
   #           box.padding = 0.6) +
  
    scale_color_manual(values = rep(c("purple", "green3"), 22 )) +
    
    # custom X axis:
   scale_x_continuous( label = axisdf$chrom2, breaks= axisdf$center) +
   xlab("Chromosome") +
    # remove space between plot area and x axis
   scale_y_continuous(expand = c(0, 0), labels = seq(0,12,2), breaks = seq(0,12,2),
                      limits = c(0,12)) + 
   ylab("-log10 p value") +
  
    # Custom the theme:
    theme_bw() +
    theme( 
      legend.position="none",
      panel.border = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank()
    ) +
    ggtitle("War trauma in newborns") +
  theme(plot.title = element_text(hjust = 0.5)) + 
  guides(x = guide_axis(angle = 90)) +
  theme(axis.text.x = element_text(size = 6))





# achronic babies

# 706987 probes total.

baby_achronic_man <- zhou %>% 
  select(probeID,CpG_chrm,CpG_end,probeType) %>%
  rename(probe = probeID, CHR = CpG_chrm, BP = CpG_end) %>%
  right_join(baby_achronic_results, by = c("probe" = "probe")) %>% 
  filter(probeType=="cg") %>% 
  mutate(chrom = str_remove(CHR,"chr")) %>% 
  arrange(chrom) %>% 
  filter(chrom != "Y") %>% # removes 6 probes
  filter(chrom != "X") %>% # removes 15114 probes. That leaves 691867 probes.
  mutate(chrom2 = factor(chrom,
                         ordered = TRUE,
                         levels = c("1","2","3","4","5","6","7",
                                    "8","9","10","11","12","13",
                                    "14","15","16","17","18","19",
                                    "20","21","22")))

baby_achronic_man2 <- baby_achronic_man %>%   
  # Compute chromosome size
  group_by(chrom2) %>% 
  summarise(chr_len=max(BP)) %>% 
  
  # Calculate cumulative position of each chromosome
  mutate(tot= cumsum(as.numeric(chr_len))-as.numeric(chr_len)) %>%
  select(-chr_len) %>%
  
  # Add this info to the initial dataset
  left_join(baby_achronic_man,., by= c("chrom2" = "chrom2")) %>%
  
  # Add a cumulative position of each SNP
  arrange(chrom2, BP) %>%
  mutate(BPcum=BP+tot)

axisdf <- baby_achronic_man2 %>% group_by(chrom2) %>% dplyr::summarize(center= (max(BPcum) + min(BPcum) ) / 2 )


baby_achronic_man <- ggplot(baby_achronic_man2, aes(x=BPcum, y=-log10(pval))) +
    
    # Show all points
    geom_point(aes(color=chrom2), alpha=0.7, size=1.1) +
    geom_hline(yintercept = -log10(0.05/nrow(baby_achronic_man2)), color = "black",
               lty = "dashed") +
  
   # geom_label_repel(aes(x= BPcum, y= p_log10, label = probe), 
   #           data = baby_achronic_man2[baby_achronic_man2$p_log10 > 
   #                                      -log10(0.05/nrow(baby_achronic_man2)), ],
   #           box.padding = 0.6) +
  
    scale_color_manual(values = rep(c("purple", "green3"), 22 )) +
    
    # custom X axis:
   scale_x_continuous( label = axisdf$chrom2, breaks= axisdf$center) +
   xlab("Chromosome") +
    # remove space between plot area and x axis
   scale_y_continuous(expand = c(0, 0), labels = seq(0,12,2), breaks = seq(0,12,2),
                      limits = c(0,12)) + 
   ylab("-log10 p value") +
  
    # Custom the theme:
    theme_bw() +
    theme( 
      legend.position="none",
      panel.border = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank()) +
    ggtitle("Chronic stress in newborns") +
  theme(plot.title = element_text(hjust = 0.5)) + 
  guides(x = guide_axis(angle = 90)) +
  theme(axis.text.x = element_text(size = 6))



###########################################################

# volcano plots for all eight analyses

mom_gtsum_volcano <- ggplot(mom_gtsum_man2,aes(x= coef, y= p_log10, label = probe)) +
  theme_bw() +
  geom_point(alpha = 0.6, shape = 20, 
                   color = ifelse(mom_gtsum_man2$p_log10 > -log10(0.05/nrow(mom_gtsum_man2)),
                                  'green4','black')) +
  geom_hline(yintercept= -log10(0.05/nrow(mom_gtsum_man2)), col="red") +
  geom_label_repel(aes(x= coef, y= p_log10, label = probe), 
             data = mom_gtsum_man2[mom_gtsum_man2$p_log10 > 
                                        -log10(0.05/nrow(mom_gtsum_man2)), ],
             box.padding = 0.6) +
  ggtitle("Congo Mothers General Trauma EWAS") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlim(-0.1,0.1) +
  ylim(0,15) +
  xlab("Effect of general trauma on DNAm at a given probe") +
  ylab("-log10 p value")

mom_setot_volcano <- ggplot(mom_setot_man2,aes(x= coef, y= p_log10, label = probe)) +
  theme_bw() +
  geom_point(alpha = 0.6, shape = 20, 
                   color = ifelse(mom_setot_man2$p_log10 > -log10(0.05/nrow(mom_setot_man2)),
                                  'green4','black')) +
  geom_hline(yintercept= -log10(0.05/nrow(mom_setot_man2)), col="red") +
  geom_label_repel(aes(x= coef, y= p_log10, label = probe), 
             data = mom_setot_man2[mom_setot_man2$p_log10 > 
                                        -log10(0.05/nrow(mom_setot_man2)), ],
             box.padding = 0.6,
             max.overlaps  = Inf) +
  ggtitle("Congo Mothers Sexual Events EWAS") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlim(-0.1,0.1) +
  ylim(0,15) +
  xlab("Effect of sexual trauma on DNAm at a given probe") +
  ylab("-log10 p value")

mom_achronic_volcano <- ggplot(mom_achronic_man2,aes(x= coef, y= p_log10, label = probe)) +
  theme_bw() +
  geom_point(alpha = 0.6, shape = 20, 
                   color = ifelse(mom_achronic_man2$p_log10 > -log10(0.05/nrow(mom_achronic_man2)),
                                  'green4','black')) +
  geom_hline(yintercept= -log10(0.05/nrow(mom_achronic_man2)), col="red") +
  geom_label_repel(aes(x= coef, y= p_log10, label = probe), 
             data = mom_achronic_man2[mom_achronic_man2$p_log10 > 
                                        -log10(0.05/nrow(mom_achronic_man2)), ],
             box.padding = 0.6,
             max.overlaps  = Inf) +
  ggtitle("Congo Mothers Chronic Stress EWAS") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlim(-0.1,0.1) +
  ylim(0,15) +
  xlab("Effect of chronic stress on DNAm at a given probe") +
  ylab("-log10 p value")

mom_awar_nr_volcano <- ggplot(mom_awar_nr_man2,aes(x= coef, y= p_log10, label = probe)) +
  theme_bw() +
  geom_point(alpha = 0.6, shape = 20, 
                   color = ifelse(mom_awar_nr_man2$p_log10 > -log10(0.05/nrow(mom_awar_nr_man2)),
                                  'green4','black')) +
  geom_hline(yintercept= -log10(0.05/nrow(mom_awar_nr_man2)), col="red") +
  geom_label_repel(aes(x= coef, y= p_log10, label = probe), 
             data = mom_awar_nr_man2[mom_awar_nr_man2$p_log10 > 
                                        -log10(0.05/nrow(mom_awar_nr_man2)), ],
             box.padding = 0.6,
             max.overlaps  = Inf) +
  ggtitle("Congo Mothers War Stress EWAS") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlim(-0.1,0.1) +
  ylim(0,15) +
  xlab("Effect of war stress on DNAm at a given probe") +
  ylab("-log10 p value")



baby_gtsum_volcano <- ggplot(baby_gtsum_man2,aes(x= coef, y= p_log10, label = probe)) +
  theme_bw() +
  geom_point(alpha = 0.6, shape = 20, 
                   color = ifelse(baby_gtsum_man2$p_log10 > -log10(0.05/nrow(baby_gtsum_man2)),
                                  'green4','black')) +
  geom_hline(yintercept= -log10(0.05/nrow(baby_gtsum_man2)), col="red") +
  geom_label_repel(aes(x= coef, y= p_log10, label = probe), 
             data = baby_gtsum_man2[baby_gtsum_man2$p_log10 > 
                                        -log10(0.05/nrow(baby_gtsum_man2)), ],
             box.padding = 0.6) +
  ggtitle("Congo Babies General Trauma EWAS") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlim(-0.1,0.1) +
  ylim(0,15) +
  xlab("Effect of general trauma on DNAm at a given probe") +
  ylab("-log10 p value")

baby_setot_volcano <- ggplot(baby_setot_man2,aes(x= coef, y= p_log10, label = probe)) +
  theme_bw() +
  geom_point(alpha = 0.6, shape = 20, 
                   color = ifelse(baby_setot_man2$p_log10 > -log10(0.05/nrow(baby_setot_man2)),
                                  'green4','black')) +
  geom_hline(yintercept= -log10(0.05/nrow(baby_setot_man2)), col="red") +
  geom_label_repel(aes(x= coef, y= p_log10, label = probe), 
             data = baby_setot_man2[baby_setot_man2$p_log10 > 
                                        -log10(0.05/nrow(baby_setot_man2)), ],
             box.padding = 0.6,
             max.overlaps  = Inf) +
  ggtitle("Congo Babies Sexual Events EWAS") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlim(-0.1,0.1) +
  ylim(0,15) +
  xlab("Effect of sexual trauma on DNAm at a given probe") +
  ylab("-log10 p value")

baby_achronic_volcano <- ggplot(baby_achronic_man2,aes(x= coef, y= p_log10, label = probe)) +
  theme_bw() +
  geom_point(alpha = 0.6, shape = 20, 
                   color = ifelse(baby_achronic_man2$p_log10 > -log10(0.05/nrow(baby_achronic_man2)),
                                  'green4','black')) +
  geom_hline(yintercept= -log10(0.05/nrow(baby_achronic_man2)), col="red") +
  geom_label_repel(aes(x= coef, y= p_log10, label = probe), 
             data = baby_achronic_man2[baby_achronic_man2$p_log10 > 
                                        -log10(0.05/nrow(baby_achronic_man2)), ],
             box.padding = 0.6,
             max.overlaps  = Inf) +
  ggtitle("Congo Babies Chronic Stress EWAS") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlim(-0.1,0.1) +
  ylim(0,15) +
  xlab("Effect of chronic stress on DNAm at a given probe") +
  ylab("-log10 p value")

baby_awar_nr_volcano <- ggplot(baby_awar_nr_man2,aes(x= coef, y= p_log10, label = probe)) +
  theme_bw() +
  geom_point(alpha = 0.6, shape = 20, 
                   color = ifelse(baby_awar_nr_man2$p_log10 > -log10(0.05/nrow(baby_awar_nr_man2)),
                                  'green4','black')) +
  geom_hline(yintercept= -log10(0.05/nrow(baby_awar_nr_man2)), col="red") +
  geom_label_repel(aes(x= coef, y= p_log10, label = probe), 
             data = baby_awar_nr_man2[baby_awar_nr_man2$p_log10 > 
                                        -log10(0.05/nrow(baby_awar_nr_man2)), ],
             box.padding = 0.6,
             max.overlaps  = Inf) +
  ggtitle("Congo Babies War Stress EWAS") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlim(-0.1,0.1) +
  ylim(0,15) +
  xlab("Effect of war stress on DNAm at a given probe") +
  ylab("-log10 p value")




# Print all manhattan plots

# mom_gtsum_man
# mom_setot_man
# mom_awar_nr_man
# mom_achronic_man
# 
# baby_gtsum_man
# baby_setot_man
# baby_awar_nr_man
# baby_achronic_man


# Print all volcano plots

# mom_gtsum_volcano
# mom_setot_volcano
# mom_awar_nr_volcano
# mom_achronic_volcano
# 
# baby_gtsum_volcano
# baby_setot_volcano
# baby_awar_nr_volcano
# baby_achronic_volcano



Get the bonferroni significant cutoffs for each test.

cat("The bonferroni significance cutoffs for mothers are: ")
## The bonferroni significance cutoffs for mothers are:
cat("mothers general trauma",0.05/nrow(mom_gtsum_man2),"  ")
## mothers general trauma 7.072336e-08
cat("mothers sexual trauma",0.05/nrow(mom_setot_man2),"  ")
## mothers sexual trauma 7.072336e-08
cat("mothers war stress",0.05/nrow(mom_awar_nr_man2),"  ")
## mothers war stress 7.072326e-08
cat("mothers chronic stress",0.05/nrow(mom_achronic_man2),"  ")
## mothers chronic stress 7.072336e-08
cat("The bonferroni significance cutoffs for babies are: ")
## The bonferroni significance cutoffs for babies are:
cat("babies general trauma",0.05/nrow(baby_gtsum_man2),"  ")
## babies general trauma 7.226822e-08
cat("babies sexual trauma",0.05/nrow(baby_setot_man2),"  ")
## babies sexual trauma 7.226822e-08
cat("babies war stress",0.05/nrow(baby_awar_nr_man2),"  ")
## babies war stress 7.226822e-08
cat("babies chronic stress",0.05/nrow(baby_achronic_man2),"  ")
## babies chronic stress 7.226822e-08

Get the final number of probes in each test:

cat("The final numbers of probes in each test for mothers are: ")
## The final numbers of probes in each test for mothers are:
cat("mothers general trauma",nrow(mom_gtsum_man2),"  ")
## mothers general trauma 706980
cat("mothers sexual trauma",nrow(mom_setot_man2),"  ")
## mothers sexual trauma 706980
cat("mothers war stress",nrow(mom_awar_nr_man2),"  ")
## mothers war stress 706981
cat("mothers chronic stress",nrow(mom_achronic_man2),"  ")
## mothers chronic stress 706980
cat("The final numbers of probes in each test for babies are: ")
## The final numbers of probes in each test for babies are:
cat("babies general trauma",nrow(baby_gtsum_man2),"  ")
## babies general trauma 691867
cat("babies sexual trauma",nrow(baby_setot_man2),"  ")
## babies sexual trauma 691867
cat("babies war stress",nrow(baby_awar_nr_man2),"  ")
## babies war stress 691867
cat("babies chronic stress",nrow(baby_achronic_man2),"  ")
## babies chronic stress 691867



Plot the data for genome-wide significant sites.

# Get significant sites for those analyses with significant sites.

mom_sig_gtsum <- mom_gtsum_man2 %>% 
  filter(pval < (0.05/nrow(mom_gtsum_man2))) %>% 
  select(probe,pval) %>% 
  mutate(exposure = c("general_trauma")) %>% 
  mutate(generation = c("mother"))

mom_sig_setot <- mom_setot_man2 %>% 
  filter(pval < (0.05/nrow(mom_setot_man2))) %>% 
  select(probe,pval) %>% 
  mutate(exposure = c("sexual_events"))%>% 
  mutate(generation = c("mother"))

mom_sig_awar_nr <- mom_awar_nr_man2 %>% 
  filter(pval < (0.05/nrow(mom_awar_nr_man2))) %>% 
  select(probe,pval) %>% 
  mutate(exposure = c("war_stress"))%>% 
  mutate(generation = c("mother"))

mom_sig_achronic <- mom_achronic_man2 %>% 
  filter(pval < (0.05/nrow(mom_achronic_man2))) %>% 
  select(probe,pval) %>% 
  mutate(exposure = c("chronic_stress"))%>% 
  mutate(generation = c("mother"))

baby_sig_gtsum <- baby_gtsum_man2 %>% 
  filter(pval < (0.05/nrow(baby_gtsum_man2))) %>% 
  select(probe,pval) %>% 
  mutate(exposure = c("general_trauma")) %>% 
  mutate(generation = c("baby"))

baby_sig_setot <- baby_setot_man2 %>% 
  filter(pval < (0.05/nrow(baby_setot_man2))) %>% 
  select(probe,pval) %>% 
  mutate(exposure = c("sexual_events"))%>% 
  mutate(generation = c("baby"))

baby_sig_awar_nr <- baby_awar_nr_man2 %>% 
  filter(pval < (0.05/nrow(baby_awar_nr_man2))) %>% 
  select(probe,pval) %>% 
  mutate(exposure = c("war_stress"))%>% 
  mutate(generation = c("baby"))

baby_sig_achronic <- baby_achronic_man2 %>% 
  filter(pval < (0.05/nrow(baby_achronic_man2))) %>% 
  select(probe,pval) %>% 
  mutate(exposure = c("chronic_stress")) %>% 
  mutate(generation = c("baby"))

mom_all_sig <- bind_rows(mom_sig_gtsum,
                         mom_sig_setot,
                         mom_sig_awar_nr,
                         mom_sig_achronic)

baby_all_sig <- bind_rows(baby_sig_gtsum,
                         baby_sig_setot,
                         baby_sig_awar_nr,
                         baby_sig_achronic)


# Any overlapping and significant sites? Nope.

table(baby_all_sig$probe %in% mom_all_sig$probe)
## 
## FALSE 
##    11
# Are all the probes in each list unique, or do
# they come up with multiple exposures?

length(mom_all_sig$probe) == length(unique(mom_all_sig$probe))
## [1] TRUE
length(baby_all_sig$probe) == length(unique(baby_all_sig$probe))
## [1] TRUE
# All significant sites are unique. They do not show up in
# any other analyses as significant hits.

# Scatter plots of raw data for significant sites

# subset mom_betas and add to dfm


# Figure out how to subset the significant
# probes for the exposure they are associated with.

dfm_sig <- as.data.frame(mom_betas) %>%
  select(any_of(mom_all_sig$probe)) %>%
  rownames_to_column(var = "methylation_id")

dfm_sig_gtsum <- dfm_sig %>% 
  select(methylation_id, any_of(
  mom_all_sig[mom_all_sig$exposure=="general_trauma",]$probe)) %>% 
  right_join(dfm, by = c("methylation_id" = "methylation_id"))

dfm_sig_setot <- dfm_sig %>% 
  select(methylation_id, any_of(
  mom_all_sig[mom_all_sig$exposure=="sexual_events",]$probe)) %>% 
  right_join(dfm, by = c("methylation_id" = "methylation_id"))

dfm_sig_awar_nr <- dfm_sig %>% 
  select(methylation_id, any_of(
  mom_all_sig[mom_all_sig$exposure=="war_stress",]$probe)) %>% 
  right_join(dfm, by = c("methylation_id" = "methylation_id"))



# scatter plot mothers gtsum
mom_gtsum_scatter <- 
  dfm_sig_gtsum %>%
  select(gtsum,cohort,starts_with("cg")) %>%
  pivot_longer(3:ncol(.), names_to = "CpG", values_to = "beta") %>%
  ggplot(aes(x = gtsum, y = beta, color = cohort)) +
  geom_point(alpha = 0.5, position = position_jitter(w = 0.1)) +
  geom_smooth(method = "lm", se = F) +
  facet_wrap(~ CpG, scales = "free") +
  ggtitle("General Trauma - Mothers - Genome Wide Significant Sites",
          subtitle = "Raw data are plotted") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme_bw() +
  scale_color_brewer(palette = "Dark2") +
  labs(color = "Cohort")


# scatter plot mothers setot
mom_setot_scatter <- 
  dfm_sig_setot %>%
  select(setot,cohort,starts_with("cg")) %>%
  pivot_longer(3:ncol(.), names_to = "CpG", values_to = "beta") %>%
  ggplot(aes(x = setot, y = beta, color = cohort)) +
  geom_point(alpha = 0.5, position = position_jitter(w = 0.1)) +
  geom_smooth(method = "lm", se = F) +
  facet_wrap(~ CpG, scales = "free") +
  ggtitle("Sexual violence - Mothers - Genome Wide Significant Sites",
          subtitle = "Raw data are plotted") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme_bw() +
  scale_color_brewer(palette = "Dark2") +
  labs(color = "Cohort")

# scatter plot mothers awar_nr
mom_awar_nr_scatter <- 
  dfm_sig_awar_nr %>%
  select(awar_nr,cohort,starts_with("cg")) %>%
  pivot_longer(3:ncol(.), names_to = "CpG", values_to = "beta") %>%
  ggplot(aes(x = awar_nr, y = beta, color = cohort)) +
  geom_point(alpha = 0.5, position = position_jitter(w = 0.1)) +
  geom_smooth(method = "lm", se = F) +
  facet_wrap(~ CpG, scales = "free") +
  ggtitle("War Stress - Mothers - Genome Wide Significant Sites",
          subtitle = "Raw data are plotted") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme_bw() +
  scale_color_brewer(palette = "Dark2") +
  labs(color = "Cohort")

# No significant hits for chronic stress


######################################
# Babies scatter plots for significant
# hits.
######################################


# subset baby_betas and add to dfb

dfb_sig <- as.data.frame(baby_betas) %>%
  select(any_of(baby_all_sig$probe)) %>%
  rownames_to_column(var = "methylation_id")

dfb_sig_gtsum <- dfb_sig %>% 
  select(methylation_id, any_of(
  baby_all_sig[baby_all_sig$exposure=="general_trauma",]$probe)) %>% 
  right_join(dfb, by = c("methylation_id" = "methylation_id"))

dfb_sig_setot <- dfb_sig %>% 
  select(methylation_id, any_of(
  baby_all_sig[baby_all_sig$exposure=="sexual_events",]$probe)) %>% 
  right_join(dfb, by = c("methylation_id" = "methylation_id"))


dfb_sig_awar_nr <- dfb_sig %>% 
  select(methylation_id, any_of(
  baby_all_sig[baby_all_sig$exposure=="war_stress",]$probe)) %>% 
  right_join(dfb, by = c("methylation_id" = "methylation_id"))


# scatter plot Babies gtsum
baby_gtsum_scatter <- 
  dfb_sig_gtsum %>%
  select(gtsum,cohort,starts_with("cg")) %>%
  pivot_longer(3:ncol(.), names_to = "CpG", values_to = "beta") %>%
  ggplot(aes(x = gtsum, y = beta, color = cohort)) +
  geom_point(alpha = 0.5, position = position_jitter(w = 0.1)) +
  geom_smooth(method = "lm", se = F) +
  facet_wrap(~ CpG, scales = "free") +
  ggtitle("General Trauma - Babies - Genome Wide Significant Sites",
          subtitle = "Raw data are plotted") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme_bw() +
  scale_color_brewer(palette = "Dark2") +
  labs(color = "Cohort")


# scatter plot Babies setot
baby_setot_scatter <- 
  dfb_sig_setot %>%
  select(setot,cohort,starts_with("cg")) %>%
  pivot_longer(3:ncol(.), names_to = "CpG", values_to = "beta") %>%
  ggplot(aes(x = setot, y = beta, color = cohort)) +
  geom_point(alpha = 0.5, position = position_jitter(w = 0.1)) +
  geom_smooth(method = "lm", se = F) +
  facet_wrap(~ CpG, scales = "free") +
  ggtitle("Sexual Events - Babies - Genome Wide Significant Sites",
          subtitle = "Raw data are plotted") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme_bw() +
  scale_color_brewer(palette = "Dark2") +
  labs(color = "Cohort")


# scatter plot Babies awar_nr
baby_awar_nr_scatter <- 
  dfb_sig_awar_nr %>%
  select(awar_nr,cohort,starts_with("cg")) %>%
  pivot_longer(3:ncol(.), names_to = "CpG", values_to = "beta") %>%
  ggplot(aes(x = awar_nr, y = beta, color = cohort)) +
  geom_point(alpha = 0.5, position = position_jitter(w = 0.1)) +
  geom_smooth(method = "lm", se = F) +
  facet_wrap(~ CpG, scales = "free") +
  ggtitle("War Stress - Babies - Genome Wide Significant Sites",
          subtitle = "Raw data are plotted") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme_bw() +
  scale_color_brewer(palette = "Dark2") +
  labs(color = "Cohort")




  zhou2 %>% 
  select(probeID,distToTSS,CGIposition) %>% 
  right_join(mom_all_sig, by = c("probeID" = "probe")) %>% 
  select(probeID,pval,exposure,generation,CGIposition,distToTSS) %>% 
  left_join(illumina2, by = c("probeID")) %>% 
  arrange(exposure) %>% 
  kable() %>% 
  kable_styling(full_width = FALSE,
                bootstrap_options = c("striped", "hover"))
probeID pval exposure generation CGIposition distToTSS gene_context
cg11408019 0e+00 general_trauma mother Island NA
cg14519777 1e-07 general_trauma mother NA 2305;4799;6420;6414;6417;6404;6521;6465;4791;6416;4785;4785;4785;3504;6417;6418;6471;6471 Body;Body;Body
cg14282695 0e+00 general_trauma mother NA 123507;123507;123991;3098;38436;123487;3099;3122;123507 Body;Body
cg16543391 1e-07 general_trauma mother N_Shelf -76;5444;4254;-78;-369;6129;-72;-82;-76;2908;-70;-600;-76;-231;658;-386 TSS1500;TSS200;TSS200;Body;Body
cg21219607 1e-07 sexual_events mother NA -354;37611;20737;37633;-354 TSS1500;TSS1500;Body;Body
cg06308131 0e+00 sexual_events mother S_Shore 26967;26967;60014;60014;26967;26967;26967;60046;60086;2151;60046;677;1260;60046;60046;60046;60046;60046;60046;60046 Body;Body;Body
cg04358942 0e+00 sexual_events mother NA 39109;-539
cg23527517 0e+00 sexual_events mother NA 5801 Body
cg14859642 1e-07 sexual_events mother NA NA
cg00489624 0e+00 sexual_events mother NA 101087;40319;101132 5’UTR
cg24308336 0e+00 sexual_events mother N_Shelf 15350;40412 Body
cg16765764 0e+00 sexual_events mother NA 24584;24584;24629;24568;24578 Body;Body;Body
cg10897169 1e-07 sexual_events mother NA 74092;73992;73916;73915;73955 Body;Body;Body
cg13740840 1e-07 war_stress mother Island 18576;-189;-213;-205 TSS1500;TSS200
cg26486174 0e+00 war_stress mother Island NA
zhou2 %>% 
  select(probeID,distToTSS,CGIposition) %>% 
  right_join(baby_all_sig, by = c("probeID" = "probe")) %>% 
  select(probeID,pval,exposure,generation,CGIposition,distToTSS) %>% 
  left_join(illumina2, by = c("probeID")) %>% 
  arrange(exposure) %>% 
  kable() %>% 
  kable_styling(full_width = FALSE,
                bootstrap_options = c("striped", "hover"))
probeID pval exposure generation CGIposition distToTSS gene_context
cg24590750 0e+00 general_trauma baby Island -1200;-1120;-1120;-1091;-66;-182;-66 TSS1500;TSS200
cg10783680 1e-07 general_trauma baby Island -58;-7;-10;-7;-19;-61;-527;-483;-230;34;17823;-58;-61 5’UTR;1stExon;5’UTR;1stExon;5’UTR;1stExon;5’UTR;1stExon;1stExon;5’UTR;TSS200
cg10338475 0e+00 sexual_events baby S_Shore -598;-598;997 TSS1500;Body
cg11386818 0e+00 sexual_events baby Island 1158;1053;209;1089;1145;1130;1945;2034;341;1057;1136;2007;2230;1959;1139;1091;1118;1145;1178;2210 TSS1500;5’UTR;5’UTR;5’UTR;TSS1500;5’UTR
cg02176407 1e-07 sexual_events baby Island 304 1stExon
cg20807701 0e+00 sexual_events baby N_Shore -677;-670;470;257;473;510;554;60;401;492;449 Body;TSS1500;5’UTR;1stExon
cg09631059 0e+00 sexual_events baby N_Shore 2854;9745;14118;14090;12496;12861;14001;14130;8132;12825;9745;9745;12789;13603;9745;7126;13627;9745 5’UTR;Body;Body;Body;Body;Body;Body
cg06873316 0e+00 sexual_events baby Island -435;-415;-450;-525 TSS1500;TSS1500
cg08985979 0e+00 war_stress baby S_Shore 2736;2773;3078 Body
cg21172322 0e+00 war_stress baby N_Shore -135;46928;46913;123;902;46870;215;46876 Body
cg00741900 0e+00 war_stress baby Island 109;-1067;-1084;-1056;-1048;-1037;-1153;-1056;-1056;-1028;-1037 5’UTR;1stExon;TSS1500

Test for interactions by cohort and stressor for genome-wide significant hits.

mom_gtsum_int <- lapply(dfm_sig_gtsum[grep("cg",colnames(dfm_sig_gtsum))], function(x){
  
  mod_sum(rlm(x ~ Age + bmi + parous + pcsec + palco + cohort + PC1_cells +
                      gtsum*cohort,
                      data = dfm_sig_gtsum))
})

mom_setot_int <- lapply(dfm_sig_setot[grep("cg",colnames(dfm_sig_setot))], function(x){
  
  mod_sum(rlm(x ~ Age + bmi + parous + pcsec + palco + cohort + PC1_cells +
                      setot*cohort,
                      data = dfm_sig_setot))
})

mom_awar_nr_int <- lapply(dfm_sig_awar_nr[grep("cg",colnames(dfm_sig_awar_nr))], function(x){
  
  mod_sum(rlm(x ~ Age + bmi + parous + pcsec + palco + cohort + PC1_cells +
                      awar_nr*cohort,
                      data = dfm_sig_awar_nr))
})


######################################
# Test for interactions among babies
# between cohort and maternal stress
######################################

baby_gtsum_int <- lapply(dfb_sig_gtsum[grep("cg",colnames(dfb_sig_gtsum))], function(x){
  
  mod_sum(rlm(x ~ age + bmi + parous + pcsec + palco + sex + ga_meth + cohort + PC1_cells + PC2_cells +
                      gtsum*cohort,
                      data = dfb_sig_gtsum))
})


baby_setot_int <- lapply(dfb_sig_setot[grep("cg",colnames(dfb_sig_setot))], function(x){
  
  mod_sum(rlm(x ~ age + bmi + parous + pcsec + palco + sex + ga_meth + cohort + PC1_cells + PC2_cells +
                      setot*cohort,
                      data = dfb_sig_setot))
})




baby_awar_nr_int <- lapply(dfb_sig_awar_nr[grep("cg",colnames(dfb_sig_awar_nr))], function(x){
  
  mod_sum(rlm(x ~ age + bmi + parous + pcsec + palco + sex + ga_meth + cohort + PC1_cells + PC2_cells +
                      awar_nr*cohort,
                      data = dfb_sig_awar_nr))
})

Note that no significant interactions were found for maternal stress and cohort for the significant probes. The effect is statistically the same across cohorts.



Predicted Values

mom_gtsum_pred <- lapply(dfm_sig_gtsum[grep("cg",colnames(dfm_sig_gtsum))], function(x){
  
  prediction(rlm(x ~ Age + bmi + parous + pcsec + palco + cohort + PC1_cells +
                      gtsum,
                      data = dfm_sig_gtsum), vcov. = sandwich)
})

mom_gtsum_pred_plots <- lapply(mom_gtsum_pred, function(x) {
  
  ggplot(x, aes(x = gtsum, y = fitted)) +
    geom_point(shape = 20, alpha = 2/3) +
    geom_smooth(method = "lm", se = TRUE, color = "red", fill = "salmon") +
    theme_bw() +
    xlab("General Trauma") +
    ggtitle("Predicted methylation among Mothers")
})

# Add individual y labels:

mom_gtsum_pred_plots$cg11408019 <- mom_gtsum_pred_plots$cg11408019 + ylab("DNAm at cg11408019")
mom_gtsum_pred_plots$cg14519777 <- mom_gtsum_pred_plots$cg14519777 + ylab("DNAm at cg14519777")
mom_gtsum_pred_plots$cg14282695 <- mom_gtsum_pred_plots$cg14282695 + ylab("DNAm at cg14282695")
mom_gtsum_pred_plots$cg16543391 <- mom_gtsum_pred_plots$cg16543391 + ylab("DNAm at cg16543391")



mom_setot_pred <- lapply(dfm_sig_setot[grep("cg",colnames(dfm_sig_setot))], function(x){
  
  prediction(rlm(x ~ Age + bmi + parous + pcsec + palco + cohort + PC1_cells +
                      setot,
                      data = dfm_sig_setot), vcov. = sandwich)
})

mom_setot_pred_plots <- lapply(mom_setot_pred, function(x) {
  
  ggplot(x, aes(x = setot, y = fitted)) +
    geom_point(shape = 20, alpha = 2/3) +
    geom_smooth(method = "lm", se = TRUE, color = "red", fill = "salmon") +
    theme_bw() +
    xlab("Sexual Events") +
    ggtitle("Predicted methylation among Mothers")
  
  
})


# Add individual y labels:

mom_setot_pred_plots$cg21219607 <- mom_setot_pred_plots$cg21219607 + ylab("DNAm at cg21219607")
mom_setot_pred_plots$cg06308131 <- mom_setot_pred_plots$cg06308131 + ylab("DNAm at cg06308131")
mom_setot_pred_plots$cg04358942 <- mom_setot_pred_plots$cg04358942 + ylab("DNAm at cg04358942")
mom_setot_pred_plots$cg23527517 <- mom_setot_pred_plots$cg23527517 + ylab("DNAm at cg23527517")
mom_setot_pred_plots$cg14859642 <- mom_setot_pred_plots$cg14859642 + ylab("DNAm at cg14859642")
mom_setot_pred_plots$cg00489624 <- mom_setot_pred_plots$cg00489624 + ylab("DNAm at cg00489624")
mom_setot_pred_plots$cg24308336 <- mom_setot_pred_plots$cg24308336 + ylab("DNAm at cg24308336")
mom_setot_pred_plots$cg16765764 <- mom_setot_pred_plots$cg16765764 + ylab("DNAm at cg16765764")
mom_setot_pred_plots$cg10897169 <- mom_setot_pred_plots$cg10897169 + ylab("DNAm at cg10897169")



mom_awar_nr_pred <- lapply(dfm_sig_awar_nr[grep("cg",colnames(dfm_sig_awar_nr))], function(x){
  
  prediction(rlm(x ~ Age + bmi + parous + pcsec + palco + cohort + PC1_cells +
                      awar_nr,
                      data = dfm_sig_awar_nr), vcov. = sandwich)
})

mom_awar_nr_pred_plots <- lapply(mom_awar_nr_pred, function(x) {
  
  ggplot(x, aes(x = awar_nr, y = fitted)) +
    geom_point(shape = 20, alpha = 2/3) +
    geom_smooth(method = "lm", se = TRUE, color = "red", fill = "salmon") +
    theme_bw() +
    xlab("War Stress") +
    ggtitle("Predicted methylation among Mothers")
  
  
})


mom_awar_nr_pred_plots$cg13740840 <- mom_awar_nr_pred_plots$cg13740840 + 
  ylab("DNAm at cg13740840")
mom_awar_nr_pred_plots$cg26486174 <- mom_awar_nr_pred_plots$cg26486174 + 
  ylab("DNAm at cg26486174")


##############################
# Babies predicted methylation
# plots
##############################

baby_gtsum_pred <- lapply(dfb_sig_gtsum[grep("cg",colnames(dfb_sig_gtsum))], function(x){
  
  prediction(rlm(x ~ age + bmi + parous + pcsec + palco + sex + ga_meth + cohort + PC1_cells + PC2_cells +
                      gtsum,
                      data = dfb_sig_gtsum), vcov. = sandwich)
})

baby_gtsum_pred_plots <- lapply(baby_gtsum_pred, function(x) {
  
  ggplot(x, aes(x = gtsum, y = fitted)) +
    geom_point(shape = 20, alpha = 2/3) +
    geom_smooth(method = "lm", se = TRUE, color = "red", fill = "salmon") +
    theme_bw() +
    xlab("General Trauma") +
    ggtitle("Predicted methylation among babies")
})


# Add y-axis labels:
baby_gtsum_pred_plots$cg24590750 <- baby_gtsum_pred_plots$cg24590750 + ylab("DNAm at cg24590750")
baby_gtsum_pred_plots$cg10783680 <- baby_gtsum_pred_plots$cg10783680 + ylab("DNAm at cg10783680")


baby_setot_pred <- lapply(dfb_sig_setot[grep("cg",colnames(dfb_sig_setot))], function(x){
  
  prediction(rlm(x ~ age + bmi + parous + pcsec + palco + sex + ga_meth + cohort + PC1_cells + PC2_cells +
                      setot,
                      data = dfb_sig_setot), vcov. = sandwich)
})

baby_setot_pred_plots <- lapply(baby_setot_pred, function(x) {
  
  ggplot(x, aes(x = setot, y = fitted)) +
    geom_point(shape = 20, alpha = 2/3) +
    geom_smooth(method = "lm", se = TRUE, color = "red", fill = "salmon") +
    theme_bw() +
    xlab("Sexual Violence") +
    ggtitle("Predicted methylation among babies")
})


# Add y-axis labels:
baby_setot_pred_plots$cg10338475 <- baby_setot_pred_plots$cg10338475 + ylab("DNAm at cg10338475")
baby_setot_pred_plots$cg11386818 <- baby_setot_pred_plots$cg11386818 + ylab("DNAm at cg11386818")
baby_setot_pred_plots$cg02176407 <- baby_setot_pred_plots$cg02176407 + ylab("DNAm at cg02176407")
baby_setot_pred_plots$cg20807701 <- baby_setot_pred_plots$cg20807701 + ylab("DNAm at cg20807701")
baby_setot_pred_plots$cg09631059 <- baby_setot_pred_plots$cg09631059 + ylab("DNAm at cg09631059")
baby_setot_pred_plots$cg06873316 <- baby_setot_pred_plots$cg06873316 + ylab("DNAm at cg06873316")


baby_awar_nr_pred <- lapply(dfb_sig_awar_nr[grep("cg",colnames(dfb_sig_awar_nr))], function(x){
  
  prediction(rlm(x ~ age + bmi + parous + pcsec + palco + sex + ga_meth + cohort + PC1_cells + PC2_cells +
                      awar_nr,
                      data = dfb_sig_awar_nr), vcov. = sandwich)
})

baby_awar_nr_pred_plots <- lapply(baby_awar_nr_pred, function(x) {
  
  ggplot(x, aes(x = awar_nr, y = fitted)) +
    geom_point(shape = 20, alpha = 2/3) +
    geom_smooth(method = "lm", se = TRUE, color = "red", fill = "salmon") +
    theme_bw() +
    xlab("General Trauma") +
    ggtitle("Predicted methylation among babies")
  
  
})


# Add y-axis labels:
baby_awar_nr_pred_plots$cg08985979 <- baby_awar_nr_pred_plots$cg08985979 + 
  ylab("DNAm at cg08985979")
baby_awar_nr_pred_plots$cg21172322 <- baby_awar_nr_pred_plots$cg21172322 + 
  ylab("DNAm at cg21172322")
baby_awar_nr_pred_plots$cg00741900 <- baby_awar_nr_pred_plots$cg00741900 + 
  ylab("DNAm at cg00741900")



# mom_gtsum_pred_plots
# mom_setot_pred_plots
# mom_awar_nr_pred_plots


# baby_gtsum_pred_plots
# baby_setot_pred_plots
# baby_awar_nr_pred_plots

methylation PCs

Are any of the top 10 methylation PCs associated with maternal stress in a multivariate analysis?

mom_pc_tests <- dfm %>%
  select(methylation_id,age,parous,pcsec,palco,ga_meth,cohort,
         PC1_cells,bmi,gtsum,setot,awar_nr,achronic,PC1:PC10)

mom_pc_gtsum_res <- apply(mom_pc_tests[14:ncol(mom_pc_tests)],2, function(x){
  
  summary(lm(x ~ age + bmi + parous + pcsec + palco + cohort + PC1_cells + cohort +
                      gtsum,
                      data = mom_pc_tests))
  
}) # Association with PC6 only, p = 0.042.



mom_pc_setot_res <- apply(mom_pc_tests[14:ncol(mom_pc_tests)],2, function(x){
  
  summary(lm(x ~ age + bmi + parous + pcsec + palco + cohort + PC1_cells + cohort +
                      setot,
                      data = mom_pc_tests))
  
}) 


mom_pc_awar_nr_res <- apply(mom_pc_tests[14:ncol(mom_pc_tests)],2, function(x){
  
  summary(lm(x ~ age + bmi + parous + pcsec + palco + cohort + PC1_cells + cohort +
                      awar_nr,
                      data = mom_pc_tests))
  
}) 



mom_pc_achronic_res <- apply(mom_pc_tests[14:ncol(mom_pc_tests)],2, function(x){
  
  summary(lm(x ~ age + bmi + parous + pcsec + palco + cohort + PC1_cells + cohort +
                      achronic,
                      data = mom_pc_tests))
  
}) 


baby_pc_tests <- dfb %>%
  select(methylation_id,age,parous,pcsec,sex,palco,ga_meth,cohort,bmi,
         PC1_cells,PC2_cells,gtsum,setot,awar_nr,achronic,PC1:PC10)



baby_pc_gtsum_res <- apply(baby_pc_tests[16:ncol(baby_pc_tests)],2, function(x){
  
  summary(lm(x ~ age + bmi + sex + parous + pcsec + palco + cohort + 
               PC1_cells + PC2_cells + ga_meth + gtsum,
                      data = baby_pc_tests))
  
}) 



baby_pc_setot_res <- apply(baby_pc_tests[16:ncol(baby_pc_tests)],2, function(x){
  
  summary(lm(x ~ age + bmi + sex + parous + pcsec + palco + cohort + 
               PC1_cells + PC2_cells + ga_meth + setot,
                      data = baby_pc_tests))
  
}) 


baby_pc_awar_nr_res <- apply(baby_pc_tests[16:ncol(baby_pc_tests)],2, function(x){
  
  summary(lm(x ~ age + bmi + sex + parous + pcsec + palco + cohort + 
               PC1_cells + PC2_cells + ga_meth + awar_nr,
                      data = baby_pc_tests))
  
}) 



baby_pc_achronic_res <- apply(baby_pc_tests[16:ncol(baby_pc_tests)],2, function(x){
  
  summary(lm(x ~ age + bmi + sex + parous + pcsec + palco + cohort + 
               PC1_cells + PC2_cells + ga_meth + achronic,
                      data = baby_pc_tests))
  
}) 

GMM analysis

Is general mean methylation associated with maternal stress in a multivariate analysis?

mom_gmm <- rowMeans(mom_betas, na.rm = T)
mom_gmm <- cbind.data.frame(names(mom_gmm),mom_gmm) 
colnames(mom_gmm) <- c("methylation_id","gmm")

dfm <- dfm %>% 
  left_join(mom_gmm, by = c("methylation_id" = "methylation_id"))

model_gmm_mom_gtsum <- lm(gmm ~ bmi + age + pcsec + palco + parous + cohort + PC1_cells +
           gtsum, data = dfm)
summary(model_gmm_mom_gtsum)
## 
## Call:
## lm(formula = gmm ~ bmi + age + pcsec + palco + parous + cohort + 
##     PC1_cells + gtsum, data = dfm)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0079520 -0.0017509  0.0001806  0.0017179  0.0070797 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.200e-01  2.548e-03 243.293   <2e-16 ***
## bmi          1.726e-05  7.195e-05   0.240    0.811    
## age         -3.705e-05  5.836e-05  -0.635    0.527    
## pcsec        4.712e-05  6.209e-04   0.076    0.940    
## palco       -2.067e-04  7.353e-04  -0.281    0.779    
## parous      -2.865e-04  7.781e-04  -0.368    0.713    
## cohortSV    -6.775e-04  6.812e-04  -0.995    0.322    
## PC1_cells   -3.313e-03  2.263e-03  -1.464    0.146    
## gtsum        3.898e-05  1.471e-04   0.265    0.791    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.002974 on 136 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.03119,    Adjusted R-squared:  -0.0258 
## F-statistic: 0.5473 on 8 and 136 DF,  p-value: 0.819
model_gmm_mom_setot <- lm(gmm ~ bmi + age + pcsec + palco + parous + cohort + PC1_cells +
           setot, data = dfm)
summary(model_gmm_mom_setot)
## 
## Call:
## lm(formula = gmm ~ bmi + age + pcsec + palco + parous + cohort + 
##     PC1_cells + setot, data = dfm)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0080284 -0.0018798  0.0003063  0.0018082  0.0069252 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.207e-01  2.560e-03 242.480   <2e-16 ***
## bmi          1.678e-05  7.161e-05   0.234    0.815    
## age         -5.171e-05  5.936e-05  -0.871    0.385    
## pcsec        1.151e-05  6.187e-04   0.019    0.985    
## palco       -1.891e-04  7.317e-04  -0.258    0.796    
## parous      -3.638e-04  7.774e-04  -0.468    0.641    
## cohortSV    -6.072e-04  6.798e-04  -0.893    0.373    
## PC1_cells   -2.800e-03  2.294e-03  -1.221    0.224    
## setot       -1.901e-04  1.637e-04  -1.161    0.248    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.002961 on 136 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.04021,    Adjusted R-squared:  -0.01625 
## F-statistic: 0.7121 on 8 and 136 DF,  p-value: 0.6805
model_gmm_mom_awar_nr <- lm(gmm ~ bmi + age + pcsec + palco + parous + cohort + PC1_cells +
           awar_nr, data = dfm)
summary(model_gmm_mom_awar_nr)
## 
## Call:
## lm(formula = gmm ~ bmi + age + pcsec + palco + parous + cohort + 
##     PC1_cells + awar_nr, data = dfm)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.007961 -0.001770  0.000055  0.001850  0.006991 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.199e-01  2.548e-03 243.287   <2e-16 ***
## bmi          2.020e-05  7.217e-05   0.280    0.780    
## age         -3.825e-05  5.835e-05  -0.656    0.513    
## pcsec        2.007e-05  6.234e-04   0.032    0.974    
## palco       -2.130e-04  7.345e-04  -0.290    0.772    
## parous      -3.085e-04  7.799e-04  -0.396    0.693    
## cohortSV    -6.636e-04  6.817e-04  -0.973    0.332    
## PC1_cells   -3.347e-03  2.263e-03  -1.479    0.142    
## awar_nr      1.215e-04  2.711e-04   0.448    0.655    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.002973 on 136 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.03212,    Adjusted R-squared:  -0.02482 
## F-statistic: 0.5641 on 8 and 136 DF,  p-value: 0.8057
model_gmm_mom_achronic <- lm(gmm ~ bmi + age + pcsec + palco + parous + cohort + PC1_cells +
           achronic, data = dfm)
summary(model_gmm_mom_achronic)
## 
## Call:
## lm(formula = gmm ~ bmi + age + pcsec + palco + parous + cohort + 
##     PC1_cells + achronic, data = dfm)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0081353 -0.0016530  0.0003181  0.0016814  0.0068304 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.212e-01  2.656e-03 233.841   <2e-16 ***
## bmi          6.822e-06  7.206e-05   0.095    0.925    
## age         -4.825e-05  5.868e-05  -0.822    0.412    
## pcsec       -7.591e-05  6.253e-04  -0.121    0.904    
## palco       -1.847e-04  7.312e-04  -0.253    0.801    
## parous      -4.045e-04  7.799e-04  -0.519    0.605    
## cohortSV    -3.006e-04  7.448e-04  -0.404    0.687    
## PC1_cells   -3.024e-03  2.262e-03  -1.337    0.184    
## achronic    -1.019e-04  8.184e-05  -1.245    0.215    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.002958 on 136 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.04162,    Adjusted R-squared:  -0.01476 
## F-statistic: 0.7382 on 8 and 136 DF,  p-value: 0.6576
baby_gmm <- rowMeans(baby_betas, na.rm = T)
baby_gmm <- cbind.data.frame(names(baby_gmm),baby_gmm) 
colnames(baby_gmm) <- c("methylation_id","gmm")

dfb <- dfb %>% 
  left_join(baby_gmm, by = c("methylation_id" = "methylation_id"))

model_gmm_baby_gtsum <- lm(gmm ~ bmi + age + pcsec + palco + parous + sex + ga_meth +
                             cohort +  PC1_cells +  PC2_cells + gtsum, data = dfb)
summary(model_gmm_baby_gtsum)
## 
## Call:
## lm(formula = gmm ~ bmi + age + pcsec + palco + parous + sex + 
##     ga_meth + cohort + PC1_cells + PC2_cells + gtsum, data = dfb)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0139586 -0.0019797 -0.0000295  0.0021322  0.0077988 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.258e-01  1.200e-02  52.143  < 2e-16 ***
## bmi         -6.690e-05  8.655e-05  -0.773 0.440913    
## age         -3.495e-06  7.006e-05  -0.050 0.960291    
## pcsec       -9.954e-04  7.313e-04  -1.361 0.175752    
## palco        3.125e-03  8.865e-04   3.525 0.000582 ***
## parous       2.206e-04  9.138e-04   0.241 0.809601    
## sexM        -1.512e-03  6.221e-04  -2.431 0.016401 *  
## ga_meth      3.859e-06  4.309e-05   0.090 0.928785    
## cohortSV    -1.296e-03  8.428e-04  -1.538 0.126407    
## PC1_cells   -5.943e-02  2.196e-03 -27.059  < 2e-16 ***
## PC2_cells   -5.368e-02  3.113e-03 -17.244  < 2e-16 ***
## gtsum        1.680e-04  1.804e-04   0.931 0.353526    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.00353 on 133 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.9072, Adjusted R-squared:  0.8996 
## F-statistic: 118.3 on 11 and 133 DF,  p-value: < 2.2e-16
model_gmm_baby_setot <- lm(gmm ~ bmi + age + pcsec + palco + parous + sex + ga_meth +
                             cohort +  PC1_cells +  PC2_cells + setot, data = dfb)
summary(model_gmm_baby_setot)
## 
## Call:
## lm(formula = gmm ~ bmi + age + pcsec + palco + parous + sex + 
##     ga_meth + cohort + PC1_cells + PC2_cells + setot, data = dfb)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0136671 -0.0021212 -0.0001054  0.0021838  0.0079436 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.272e-01  1.195e-02  52.501  < 2e-16 ***
## bmi         -6.695e-05  8.672e-05  -0.772 0.441485    
## age          3.896e-06  7.187e-05   0.054 0.956854    
## pcsec       -9.474e-04  7.330e-04  -1.293 0.198398    
## palco        3.071e-03  8.879e-04   3.458 0.000731 ***
## parous       2.612e-04  9.168e-04   0.285 0.776147    
## sexM        -1.456e-03  6.187e-04  -2.352 0.020118 *  
## ga_meth     -1.118e-06  4.317e-05  -0.026 0.979374    
## cohortSV    -1.389e-03  8.451e-04  -1.644 0.102533    
## PC1_cells   -5.931e-02  2.218e-03 -26.744  < 2e-16 ***
## PC2_cells   -5.331e-02  3.078e-03 -17.322  < 2e-16 ***
## setot        1.168e-04  1.947e-04   0.600 0.549632    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.003536 on 133 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.9069, Adjusted R-squared:  0.8992 
## F-statistic: 117.8 on 11 and 133 DF,  p-value: < 2.2e-16
model_gmm_baby_awar_nr <- lm(gmm ~ bmi + age + pcsec + palco + parous + sex + ga_meth +
                             cohort +  PC1_cells +  PC2_cells + awar_nr, data = dfb)
summary(model_gmm_baby_awar_nr)
## 
## Call:
## lm(formula = gmm ~ bmi + age + pcsec + palco + parous + sex + 
##     ga_meth + cohort + PC1_cells + PC2_cells + awar_nr, data = dfb)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0134419 -0.0020071 -0.0000481  0.0021228  0.0077329 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.251e-01  1.208e-02  51.759  < 2e-16 ***
## bmi         -5.740e-05  8.711e-05  -0.659 0.511076    
## age         -1.013e-05  7.015e-05  -0.144 0.885390    
## pcsec       -1.073e-03  7.377e-04  -1.454 0.148327    
## palco        3.119e-03  8.857e-04   3.521 0.000589 ***
## parous       1.430e-04  9.175e-04   0.156 0.876410    
## sexM        -1.517e-03  6.217e-04  -2.441 0.015972 *  
## ga_meth      6.304e-06  4.330e-05   0.146 0.884459    
## cohortSV    -1.268e-03  8.444e-04  -1.501 0.135682    
## PC1_cells   -5.933e-02  2.199e-03 -26.975  < 2e-16 ***
## PC2_cells   -5.352e-02  3.081e-03 -17.374  < 2e-16 ***
## awar_nr      3.283e-04  3.280e-04   1.001 0.318680    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.003528 on 133 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.9073, Adjusted R-squared:  0.8997 
## F-statistic: 118.4 on 11 and 133 DF,  p-value: < 2.2e-16
model_gmm_baby_achronic <- lm(gmm ~ bmi + age + pcsec + palco + parous + sex + ga_meth +
                             cohort + PC1_cells +  PC2_cells + achronic, data = dfb)
summary(model_gmm_baby_achronic)
## 
## Call:
## lm(formula = gmm ~ bmi + age + pcsec + palco + parous + sex + 
##     ga_meth + cohort + PC1_cells + PC2_cells + achronic, data = dfb)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.013766 -0.002232  0.000072  0.002145  0.008040 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.273e-01  1.202e-02  52.174  < 2e-16 ***
## bmi         -6.925e-05  8.712e-05  -0.795 0.428100    
## age         -7.907e-06  7.109e-05  -0.111 0.911603    
## pcsec       -9.867e-04  7.374e-04  -1.338 0.183131    
## palco        3.090e-03  8.884e-04   3.478 0.000683 ***
## parous       2.079e-04  9.222e-04   0.225 0.821956    
## sexM        -1.429e-03  6.196e-04  -2.307 0.022595 *  
## ga_meth      5.693e-07  4.312e-05   0.013 0.989484    
## cohortSV    -1.268e-03  9.176e-04  -1.382 0.169232    
## PC1_cells   -5.944e-02  2.207e-03 -26.937  < 2e-16 ***
## PC2_cells   -5.308e-02  3.076e-03 -17.258  < 2e-16 ***
## achronic    -2.136e-05  9.481e-05  -0.225 0.822100    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.00354 on 133 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.9067, Adjusted R-squared:  0.899 
## F-statistic: 117.5 on 11 and 133 DF,  p-value: < 2.2e-16

Is general mean methylation associated with birth weight in a multivariate analysis?

model_gmm_mom_bwgt <- lm(bwgt ~ bmi + age + pcsec + palco + parous + cohort + PC1_cells + ga_meth + sex + gmm, data = dfm)
summary(model_gmm_mom_bwgt)
## 
## Call:
## lm(formula = bwgt ~ bmi + age + pcsec + palco + parous + cohort + 
##     PC1_cells + ga_meth + sex + gmm, data = dfm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -951.98 -262.90  -41.36  269.98 1269.96 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   969.583   7549.626   0.128 0.898010    
## bmi            27.737     10.034   2.764 0.006541 ** 
## age            18.971      8.283   2.290 0.023622 *  
## pcsec         185.248     88.675   2.089 0.038666 *  
## palco         -96.423    104.069  -0.927 0.355900    
## parous         63.750    109.800   0.581 0.562518    
## cohortSV     -105.136     97.619  -1.077 0.283490    
## PC1_cells    -146.513    317.691  -0.461 0.645447    
## ga_meth        18.341      4.781   3.837 0.000194 ***
## sexM          150.266     70.839   2.121 0.035815 *  
## gmm         -6922.591  12027.295  -0.576 0.565906    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 411.8 on 129 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.3903, Adjusted R-squared:  0.343 
## F-statistic: 8.256 on 10 and 129 DF,  p-value: 2.978e-10
model_gmm_baby_bwgt <- lm(bwgt ~ bmi + age + pcsec + palco + parous + sex + ga_meth +
                             cohort + PC1_cells +  PC2_cells + gmm, data = dfb)
summary(model_gmm_baby_bwgt)
## 
## Call:
## lm(formula = bwgt ~ bmi + age + pcsec + palco + parous + sex + 
##     ga_meth + cohort + PC1_cells + PC2_cells + gmm, data = dfb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -908.05 -282.78  -35.77  251.19 1222.88 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2841.009   6539.612   0.434 0.664685    
## bmi             26.048     10.168   2.562 0.011533 *  
## age             16.889      8.223   2.054 0.041965 *  
## pcsec          182.882     87.093   2.100 0.037647 *  
## palco          -67.572    108.159  -0.625 0.533212    
## parous          43.828    107.156   0.409 0.683196    
## sexM           160.551     73.678   2.179 0.031101 *  
## ga_meth         19.839      5.026   3.947 0.000128 ***
## cohortSV      -105.658    100.026  -1.056 0.292757    
## PC1_cells     -684.314    662.472  -1.033 0.303507    
## PC2_cells     -634.333    646.705  -0.981 0.328451    
## gmm         -10371.518  10189.110  -1.018 0.310586    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 412.7 on 132 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.3927, Adjusted R-squared:  0.3421 
## F-statistic:  7.76 on 11 and 132 DF,  p-value: 2.965e-10

birthweight

Are any of the stress measures correlated with birth weight in a multivariate analysis?

model_gtsum <- lm(bwgt ~ bmi + age + pcsec + palco + parous + cohort + ga_meth + sex +
           gtsum, data = dfm)
summary(model_gtsum)
## 
## Call:
## lm(formula = bwgt ~ bmi + age + pcsec + palco + parous + cohort + 
##     ga_meth + sex + gtsum, data = dfm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -906.33 -276.85   -5.05  249.72 1350.42 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3183.342   1319.391  -2.413 0.017229 *  
## bmi            27.539      9.960   2.765 0.006524 ** 
## age            18.684      8.151   2.292 0.023491 *  
## pcsec         182.699     87.966   2.077 0.039778 *  
## palco         -97.647    103.207  -0.946 0.345840    
## parous         60.819    106.162   0.573 0.567710    
## cohortSV     -105.061     96.697  -1.086 0.279274    
## ga_meth        18.102      4.748   3.812 0.000212 ***
## sexM          159.603     70.503   2.264 0.025243 *  
## gtsum         -24.330     21.424  -1.136 0.258207    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 408.9 on 130 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.394,  Adjusted R-squared:  0.352 
## F-statistic:  9.39 on 9 and 130 DF,  p-value: 6.507e-11
model_setot <- lm(bwgt ~ bmi + age + pcsec + palco + parous + cohort + ga_meth + sex +
           setot, data = dfm)
summary(model_setot) 
## 
## Call:
## lm(formula = bwgt ~ bmi + age + pcsec + palco + parous + cohort + 
##     ga_meth + sex + setot, data = dfm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -998.48 -252.27  -12.67  252.08 1087.43 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3211.559   1312.740  -2.446 0.015763 *  
## bmi            27.728      9.927   2.793 0.006005 ** 
## age            21.250      8.285   2.565 0.011460 *  
## pcsec         189.762     87.760   2.162 0.032428 *  
## palco         -97.164    102.802  -0.945 0.346334    
## parous         66.234    105.931   0.625 0.532896    
## cohortSV     -113.724     96.723  -1.176 0.241839    
## ga_meth        17.636      4.748   3.715 0.000301 ***
## sexM          149.445     69.890   2.138 0.034364 *  
## setot          33.159     22.322   1.485 0.139834    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 407.5 on 130 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.3982, Adjusted R-squared:  0.3565 
## F-statistic: 9.557 on 9 and 130 DF,  p-value: 4.289e-11
model_awar_nr <- lm(bwgt ~ bmi + age + pcsec + palco + parous + cohort + ga_meth + sex +
           awar_nr, data = dfm)
summary(model_awar_nr) # significant nega_methtive association. p = 0.002. b = -115.8
## 
## Call:
## lm(formula = bwgt ~ bmi + age + pcsec + palco + parous + cohort + 
##     ga_meth + sex + awar_nr, data = dfm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -886.92 -275.65   -2.93  249.10 1277.11 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2759.771   1285.624  -2.147 0.033679 *  
## bmi            24.875      9.683   2.569 0.011328 *  
## age            19.899      7.900   2.519 0.012988 *  
## pcsec         214.687     85.759   2.503 0.013540 *  
## palco        -102.006     99.907  -1.021 0.309147    
## parous         88.800    103.254   0.860 0.391360    
## cohortSV     -126.836     93.918  -1.350 0.179202    
## ga_meth        16.904      4.616   3.662 0.000363 ***
## sexM          171.134     68.200   2.509 0.013325 *  
## awar_nr      -115.839     36.661  -3.160 0.001965 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 396 on 130 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.4316, Adjusted R-squared:  0.3923 
## F-statistic: 10.97 on 9 and 130 DF,  p-value: 1.37e-12
# n = 140, 11 observations deleted due to missingness.

# How many samples in this model?
nrow(model.frame(model_awar_nr)) # 140, out of 151 (11 NA observations deleted).
## [1] 140
model_achronic <- lm(bwgt ~ bmi + age + pcsec + palco + parous + cohort + ga_meth + sex +
           achronic, data = dfm)
summary(model_achronic)
## 
## Call:
## lm(formula = bwgt ~ bmi + age + pcsec + palco + parous + cohort + 
##     ga_meth + sex + achronic, data = dfm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -940.26 -275.79  -17.07  243.81 1285.39 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3197.809   1330.971  -2.403 0.017691 *  
## bmi            26.901     10.068   2.672 0.008505 ** 
## age            18.231      8.251   2.209 0.028891 *  
## pcsec         176.901     89.129   1.985 0.049276 *  
## palco         -92.168    103.518  -0.890 0.374921    
## parous         50.092    107.090   0.468 0.640744    
## cohortSV      -75.666    106.061  -0.713 0.476863    
## ga_meth        18.173      4.767   3.812 0.000212 ***
## sexM          153.429     70.499   2.176 0.031338 *  
## achronic       -6.154     11.350  -0.542 0.588634    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 410.5 on 130 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.3893, Adjusted R-squared:  0.3471 
## F-statistic: 9.209 on 9 and 130 DF,  p-value: 1.026e-10

Are any of the top hits correlated with birthweight in a multivariate analysis?

cpg_model_mom_gtsum <- lapply(dfm_sig_gtsum[grep("cg",colnames(dfm_sig_gtsum))], function(x){
  
  summary(lm(bwgt ~ Age + bmi + parous + pcsec + palco + cohort + PC1_cells + ga_meth + sex +
                      x,
                      data = dfm_sig_gtsum))
})

cpg_model_mom_setot <- lapply(dfm_sig_setot[grep("cg",colnames(dfm_sig_setot))], function(x){
  
  summary(lm(bwgt ~ Age + bmi + parous + pcsec + palco + cohort + PC1_cells + ga_meth + sex +
                      x,
                      data = dfm_sig_setot))
})


cpg_model_mom_awar_nr <- lapply(dfm_sig_awar_nr[grep("cg",colnames(dfm_sig_awar_nr))], function(x){
  
  summary(lm(bwgt ~ Age + bmi + parous + pcsec + palco + cohort + PC1_cells + ga_meth + sex +
                      x,
                      data = dfm_sig_awar_nr))
})




# babies


cpg_model_baby_gtsum <- lapply(dfb_sig_gtsum[grep("cg",colnames(dfb_sig_gtsum))], function(x){
  
  summary(lm(bwgt ~ age + bmi + parous + pcsec + palco + cohort + PC1_cells + PC2_cells + sex + ga_meth +
                      x,
                      data = dfb_sig_gtsum))
})

cpg_model_baby_setot <- lapply(dfb_sig_setot[grep("cg",colnames(dfb_sig_setot))], function(x){
  
  summary(lm(bwgt ~ age + bmi + parous + pcsec + palco + cohort + PC1_cells + PC2_cells + sex + ga_meth +
                      x,
                      data = dfb_sig_setot))
})


cpg_model_baby_awar_nr <- lapply(dfb_sig_awar_nr[grep("cg",colnames(dfb_sig_awar_nr))], function(x){
  
  summary(lm(bwgt ~ age + bmi + parous + pcsec + palco + cohort + PC1_cells + PC2_cells + sex + ga_meth +
                      x,
                      data = dfb_sig_awar_nr))
})


nrow(model.frame(lm(bwgt ~ age + bmi + parous + pcsec + palco + cohort + PC1_cells + PC2_cells + sex + ga_meth +
                      cg08985979,
                      data = dfb_sig_awar_nr)))
## [1] 144
# 144 participants included, 7 observations deleted due to missingness.

cg08985979_model <- check_model(lm(bwgt ~ age + bmi + parous + pcsec +
                                     palco + cohort + PC1_cells + PC2_cells + sex + ga_meth +
                                     cg08985979,
                                   data = dfb_sig_awar_nr))

cg08985979_pred <- prediction(lm(bwgt ~ age + bmi + parous + pcsec +
                                     palco + cohort + PC1_cells + PC2_cells + sex + ga_meth +
                                   cg08985979,
                                   data = dfb_sig_awar_nr))

cg08985979_pred <- 
    ggplot(cg08985979_pred, aes(x = cg08985979, y = fitted)) +
    geom_point(shape = 20, alpha = 2/3) +
    geom_smooth(method = "lm", se = TRUE, color = "red", fill = "salmon") +
    theme_bw() +
    ylab("Adjusted birthweight (grams)") +
    xlab("Methylation at cg08985979") +
    ggtitle("Birthweight is predicted by DNAm at cg08985979 among babies",
            subtitle = "beta = -4770.2, p = 0.03")

Analyze Overlap of Top Hits

# Get top 1000 significant sites for each analyses. Check for overlap
# across the four measures within generation and then check for overlap
# across generation within each of the four stress measures. Use
# Venn Diagrams.


top1k_mom_gtsum <- mom_gtsum_man2 %>% 
  select(probe,pval) %>% 
  arrange(pval) %>% 
  slice_head(n = 1000)

top1k_mom_setot <- mom_setot_man2 %>% 
  select(probe,pval) %>% 
  arrange(pval) %>% 
  slice_head(n = 1000)

top1k_mom_awar_nr <- mom_awar_nr_man2 %>% 
  select(probe,pval) %>% 
  arrange(pval) %>% 
  slice_head(n = 1000)

top1k_mom_achronic <- mom_achronic_man2 %>% 
  select(probe,pval) %>% 
  arrange(pval) %>% 
  slice_head(n = 1000)

##########################
# Babies top 1000 hits
#########################

top1k_baby_gtsum <- baby_gtsum_man2 %>% 
  select(probe,pval) %>% 
  arrange(pval) %>% 
  slice_head(n = 1000)

top1k_baby_setot <- baby_setot_man2 %>% 
  select(probe,pval) %>% 
  arrange(pval) %>% 
  slice_head(n = 1000)

top1k_baby_awar_nr <- baby_awar_nr_man2 %>% 
  select(probe,pval) %>% 
  arrange(pval) %>% 
  slice_head(n = 1000)

top1k_baby_achronic <- baby_achronic_man2 %>% 
  select(probe,pval) %>% 
  arrange(pval) %>% 
  slice_head(n = 1000)

# Within generation across all four stress measures

mom_within <- list(gen_trauma = top1k_mom_gtsum$probe,
                   sexual_events = top1k_mom_setot$probe,
                   war_stress = top1k_mom_awar_nr$probe,
                   chronic_stress = top1k_mom_achronic$probe)

mom_within_venn <- ggVennDiagram(mom_within) +
  labs(title = "Mothers top 1k hits") + 
  scale_x_continuous(expand = expansion(mult = .2))


baby_within <- list(gen_trauma = top1k_baby_gtsum$probe,
                   sexual_events = top1k_baby_setot$probe,
                   war_stress = top1k_baby_awar_nr$probe,
                   chronic_stress = top1k_baby_achronic$probe)

baby_within_venn <- ggVennDiagram(baby_within) +
  labs(title = "Babies top 1k hits") + 
  scale_x_continuous(expand = expansion(mult = .2))

within_stress_venn <- ggarrange(mom_within_venn,baby_within_venn, nrow = 2, common.legend = TRUE)



# Across generation for each stress measure

gtsum_across <- list(mothers = top1k_mom_gtsum$probe,
                     babies = top1k_baby_gtsum$probe)

gtsum_across_venn <- ggVennDiagram(gtsum_across) +
  labs(title = "General Trauma - Mother infant dyads")

setot_across <- list(mothers = top1k_mom_setot$probe,
                     babies = top1k_baby_setot$probe)

setot_across_venn <- ggVennDiagram(setot_across) +
  labs(title = "Sexual Events - Mother infant dyads")

awar_nr_across <- list(mothers = top1k_mom_awar_nr$probe,
                     babies = top1k_baby_awar_nr$probe)

awar_nr_across_venn <- ggVennDiagram(awar_nr_across) +
  labs(title = "General Trauma - Mother infant dyads")

achronic_across <- list(mothers = top1k_mom_achronic$probe,
                     babies = top1k_baby_achronic$probe)

achronic_across_venn <- ggVennDiagram(achronic_across) +
  labs(title = "General Trauma - Mother infant dyads")

between_stress_venn <- ggarrange(gtsum_across_venn,setot_across_venn,
          awar_nr_across_venn,achronic_across_venn,
          ncol = 2,
          nrow = 2, common.legend = TRUE)



eFORGE enrichment

Prepare files to upload to the eFORGE and eFORGE-TF websites for analysis. These sites can be accessed through links in this paper, which describes the two tools.

# Get all sites with top 125 hits for each EWAS and then
# write them to text files to be uploaded to the 
# eFORGE and eFORGE-TF websites. The author recommends using larger
# 125-1250 probes and separating hypo and hyper methylation probes.

# Mothers 

## General Trauma

# top 125 - hyper and hypo methylated combined

mom_gtsum_top_125 <- mom_gtsum_man2 %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)


write.table(mom_gtsum_top_125, 
            file = here("output","mothers_gtsum_top_125_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)

# top 125 - hyper methylated only

mom_gtsum_top_125_hyper <- mom_gtsum_man2 %>% 
  filter(coef > 0) %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)

write.table(mom_gtsum_top_125_hyper, 
            file = here("output","mothers_gtsum_top_125_hyper_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)


# top 125 - hypo methylated only

mom_gtsum_top_125_hypo <- mom_gtsum_man2 %>% 
  filter(coef < 0) %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)

write.table(mom_gtsum_top_125_hypo, 
            file = here("output","mothers_gtsum_top_125_hypo_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)


## Sexual Events

# top 125 - hyper and hypo methylated combined 

mom_setot_top_125 <- mom_setot_man2 %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)

write.table(mom_setot_top_125, 
            file = here("output","mothers_setot_top_125_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)

# top 125 - hyper methylated only

mom_setot_top_125_hyper <- mom_setot_man2 %>% 
  filter(coef > 0) %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)

write.table(mom_setot_top_125_hyper, 
            file = here("output","mothers_setot_top_125_hyper_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)


# top 125 - hypo methylated only

mom_setot_top_125_hypo <- mom_setot_man2 %>% 
  filter(coef < 0) %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)

write.table(mom_setot_top_125_hypo, 
            file = here("output","mothers_setot_top_125_hypo_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)


## War Stress

# top 125 - hyper and hypo methylated combined

mom_awar_nr_top_125 <- mom_awar_nr_man2 %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)


write.table(mom_awar_nr_top_125, 
            file = here("output","mothers_awar_nr_top_125_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)

# top 125 - hyper methylated only

mom_awar_nr_top_125_hyper <- mom_awar_nr_man2 %>% 
  filter(coef > 0) %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)

write.table(mom_awar_nr_top_125_hyper, 
            file = here("output","mothers_awar_nr_top_125_hyper_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)


# top 125 - hypo methylated only

mom_awar_nr_top_125_hypo <- mom_awar_nr_man2 %>% 
  filter(coef < 0) %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)

write.table(mom_awar_nr_top_125_hypo, 
            file = here("output","mothers_awar_nr_top_125_hypo_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)


# Babies


## General Trauma

# top 125 - hyper and hypo methylated combined

baby_gtsum_top_125 <- baby_gtsum_man2 %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)

write.table(baby_gtsum_top_125, 
            file = here("output","babies_gtsum_top_125_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)

# top 125 - hyper methylated only

baby_gtsum_top_125_hyper <- baby_gtsum_man2 %>% 
  filter(coef > 0) %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)

write.table(baby_gtsum_top_125_hyper, 
            file = here("output","babies_gtsum_top_125_hyper_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)


# top 125 - hypo methylated only

baby_gtsum_top_125_hypo <- baby_gtsum_man2 %>% 
  filter(coef < 0) %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)

write.table(baby_gtsum_top_125_hypo, 
            file = here("output","babies_gtsum_top_125_hypo_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)


## Sexual Events

# top 125 - hyper and hypo methylated combined

baby_setot_top_125 <- baby_setot_man2 %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)

write.table(baby_setot_top_125, 
            file = here("output","babies_setot_top_125_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)

# top 125 - hyper methylated only

baby_setot_top_125_hyper <- baby_setot_man2 %>% 
  filter(coef > 0) %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)

write.table(baby_setot_top_125_hyper, 
            file = here("output","babies_setot_top_125_hyper_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)


# top 125 - hypo methylated only

baby_setot_top_125_hypo <- baby_setot_man2 %>% 
  filter(coef < 0) %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)

write.table(baby_setot_top_125_hypo, 
            file = here("output","babies_setot_top_125_hypo_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)


## War Stress

# top 125 - hyper and hypo methylated combined


baby_awar_nr_top_125 <- baby_awar_nr_man2 %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)


write.table(baby_awar_nr_top_125, 
            file = here("output","babies_awar_nr_top_125_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)

# top 125 - hyper methylated only

baby_awar_nr_top_125_hyper <- baby_awar_nr_man2 %>% 
  filter(coef > 0) %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)

write.table(baby_awar_nr_top_125_hyper, 
            file = here("output","babies_awar_nr_top_125_hyper_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)


# top 125 - hypo methylated only

baby_awar_nr_top_125_hypo <- baby_awar_nr_man2 %>% 
  filter(coef < 0) %>% 
  slice_min(order_by = pval, n = 125) %>% 
  pull(probe)

write.table(baby_awar_nr_top_125_hypo, 
            file = here("output","babies_awar_nr_top_125_hypo_eforge.txt"),
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE)



Now read in results from the eFORGE website using the text files generated in the chunk above and make a heatmap of results for newborns only.

# Create list of text files
txt_files_ls = list.files(path=here("data"), pattern="*.gz") 
# Read the files in, assuming tab separator
txt_files_df <- lapply(txt_files_ls, function(x) {read.table(file = here("data",x), 
                                                             header = TRUE, 
                                                             sep ="\t")})
# Combine them
combined_df <- do.call("rbind", lapply(txt_files_df, as.data.frame)) 



edf <- combined_df %>% 
  mutate(exposure = c(rep(c("War Trauma"),39*3), 
         rep(c("General Trauma"), 39*3),
         rep(c("Sexual Trauma"),39*3))) %>% 
  mutate(analysis = c(rep(c("Top 125 Hyper"),39),
         rep(c("Top 125 Hypo"),39),
         rep(c("Top 125"),39),
         rep(c("Top 125 Hyper"),39),
         rep(c("Top 125 Hypo"),39),
         rep(c("Top 125"),39),
         rep(c("Top 125 Hyper"),39),
         rep(c("Top 125 Hypo"),39),
         rep(c("Top 125"),39))) %>% 
  mutate(exposure_analysis = paste(exposure,analysis,sep = " - "))



edf$Tissue2 <- fct_collapse(edf$Tissue,
                           Embryonic = c("ES Cell"),
                           iPS = c("IPS cell"),
                           Fetal = c("Feta Intestine Small",
                                     "Fetal Adrenal Gland",
                                     "Fetal Brain",
                                     "Fetal Heart",
                                     "Fetal Intestine Large",
                                     "Fetal Kidney",
                                     "Fetal Lung",
                                     "Fetal Muscle Leg",
                                     "Fetal Muscle Trunk",
                                     "Fetal Stomach",
                                     "Fetal Thymus"),
                           
                           Other = c("Blood","Breast","Gastric",
                                     "Lung","Ovary","Pancreas",
                                     "Placenta","Psoas Muscle",
                                     "Skin","Small Intestine"))

# order the tissue2 column

edf$Tissue2 <- factor(edf$Tissue2,
                     levels = c("Embryonic","iPS","Fetal","Other"),
                     ordered = TRUE)

edf2 <- edf %>% 
  slice_head(n=39) %>% 
  arrange(Tissue2)



# Get Qvalues into cells of a matrix with tissue type in rownames
# and exposure_analysis in colnames.

qmat <- matrix(edf$Qvalue, nrow = 39, ncol = 9,
               byrow = FALSE)

rownames(qmat) <- edf$Cell[1:39]
colnames(qmat) <- unique(edf$exposure_analysis)

# Now rearrange the columns so that general trauma
# are the first three columns, then sexual events,
# then war trauma

qmat1 <- qmat[,4:9]
qmat2 <- qmat[,1:3]
qmat3 <- cbind(qmat1,qmat2)

# rearrange rows to match the Tissue2 vector in edf2.

linker <- match(edf2$Cell,rownames(qmat3))
# 
qmat4 <- qmat3[linker,]

#rownames(qmat4) <- edf2$Cell

colnames(qmat4) <- rep(c("Top 125 Hyper","Top 125 Hypo","Top 125"),3)

col_fun = colorRamp2(c(0, 1), c("red","blue"))



# Make row annotation:
RowAnn <- data.frame(edf2$Tissue2)
  colnames(RowAnn) <- c("Tissue")
  colours2 <- list("Tissue" = c(
    "Other"="green",
    "Embryonic"="darkgreen",
    "Fetal" = "darkseagreen",
    "iPS" = "darkolivegreen2"))
  RowAnn <- HeatmapAnnotation(df=RowAnn, col=colours2, which="row")
  
tissue_matrix <- matrix(edf2$Tissue2[1:39],ncol = 1)
colnames(tissue_matrix) <- "Tissue"
rownames(tissue_matrix) <- rownames(qmat4)



hmap <- Heatmap(qmat4,
        name = "q value",
        col = col_fun,
        show_column_names = TRUE,
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        cell_fun = function(j, i, x, y, w, h, fill) {
          if (qmat4[i, j] < 0.001) {
          grid.text("***", x, y)
          } else if (qmat4[i, j] < 0.01) {
          grid.text("**", x, y)
          } else if (qmat4[i, j] < 0.05) {
          grid.text("*", x, y)
          }
          },
         column_split = rep(1:3, each = 3),
        width = unit(100,"mm"),
        heatmap_height = unit(180,"mm"),
        left_annotation = RowAnn,
        use_raster = TRUE,
        raster_quality = 5,
        top_annotation = HeatmapAnnotation(foo = anno_block(gp = gpar(fill = 5:7),
        labels = c("general trauma", "sexual trauma", "war trauma"))))


draw(hmap, heatmap_legend_side="left", 
     annotation_legend_side="left",
     merge_legend = TRUE)

Epigenetic Age Analyses

Pick up analyses with the mage4 object created above.

Drop samples failing QC

Also drop samples marked for dropping at the top of the script due to sample problems like duplicates and siblings in the data set.

# Horvath recommends removing samples with corSampleVSgoldstandard < 0.8.
# This "gs" object created above contains the only sample failing this check.

# The rest of the samples to remove will be based on the Meffil
# quality control, which is more robust than
# the epigenetic clock sex checks, for example. Summarized in this object:

fail_qc
##  [1] "C006bb.1.2."  "C5bb.1.2"     "C6.bb.2.1a"   "C91bb.1.2"    "SV001b.b.1.2"
##  [6] "SV014b.b.1.1" "C084M"        "C086M"        "SV018M.1.1"   "SV043M.2.1a"
gs %in% fail_qc # TRUE.
## [1] TRUE
################################################################
# DROP THE SIBLINGS AND DUPLICATE SAMPLES in the dropper object
################################################################

droppers
##  [1] "C91bb.1.2"            "C102M.2.1."           "C102bb.2.1."         
##  [4] "C091M.1.1."           "C091bb.2.1."          "C103bb.2.1."         
##  [7] "C103M.1.1."           "C101M.1.1"            "C101bb.1.1"          
## [10] "C76bb.1.1"            "C76M.1.2"             "SV058M.1.1 - eq101"  
## [13] "SV069M.1.1"           "SV069b.b.1.1 - eq100" "SV058b.b.1.1"        
## [16] "SV043M.2.1a"
mom_mage <- mage3_mother %>% 
  filter(!methylation_id %in% fail_qc) %>% # Lose 4 samples to QC
  filter(!methylation_id %in% droppers) %>% # Lose 7 samples because of duplicate moms.
  mutate(bmi = mwgt/((mhgt/100)^2)) %>% 
  rename(parous = is_this_your_first_child) %>%
  # This line brings in the cell type principal components
  left_join(mom_pr_cells, by = c("methylation_id" = "methylation_id"))

baby_mage <- mage3_baby %>% 
  filter(!methylation_id %in% fail_qc) %>% # 4 samples lost to QC
  filter(!methylation_id %in% droppers) %>% # Lose 7 samples because of sibling samples.
  mutate(bmi = mwgt/((mhgt/100)^2)) %>% 
  rename(parous = is_this_your_first_child) %>% 
  # This line brings in the cell type principal components
  left_join(baby_pr_cells, by = c("methylation_id" = "methylation_id"))

# What are the differences between the EWAS analytic sample
# and the epigenetic age analytic sample?

setdiff(mom_mage$methylation_id,dfm$methylation_id)
## [1] "C25M1.1"       "C49M.1.2_400"  "C7M.1.1_clean" "C2M.1.1_250"
setdiff(baby_mage$methylation_id,dfb$methylation_id)
## [1] "C1bb.1.1"        "C25b.b.1.1"      "C34bb.1.1_clean" "C78bb.1.1_250"
# These differences are accounted for by the fact that
# we dropped some of the small batches for the EWAS
# analyses to enable batch correction using ComBat. Some
# of the batches were so small they were preventing batch
# correction. This isn't a recognized problem for
# epigenetic age analyses.

dim(baby_mage) # 155 babies
## [1] 155 161
dim(mom_mage) # 155 mothers
## [1] 155 160
# create the final wide dataset

mage_wide <- baby_mage %>% 
 full_join(mom_mage, by = c("dyad" = "dyad"), suffix = c("_baby","_mother"))

# 160 different dyads in the wide data set.

Multivariate Analyses

Test for associations between stress measures and age-appropriate DNA methylation clocks. Use Horvath, IEAA, and EEAA for babies. Use the same three plus DNAmTLAdjAge, GrimAge and PhenoAge for mothers. Also do a birthweight test.

mothers

mom_eaa_cols <- c("AgeAccelerationResidual",
                  "IEAA","EEAA","DNAmTLAdjAge",
                  "AgeAccelPheno","AgeAccelGrim")

mom_gtsum_eaa <- apply(mom_mage[mom_eaa_cols], 2, function(x){
  
  res <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + gtsum, data = mom_mage),
              conf.int = TRUE)
  res_cell <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + PC1_cells + gtsum, data = mom_mage),
              conf.int = TRUE)
  return(list(res,res_cell))
})

mom_gtsum_eaa <- lapply(mom_gtsum_eaa,getValues)
mom_gtsum_eaa <- do.call(rbind,mom_gtsum_eaa)

mom_setot_eaa <- apply(mom_mage[mom_eaa_cols], 2, function(x){
  
  res <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + setot, data = mom_mage),
              conf.int = TRUE)
  res_cell <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + PC1_cells + setot, data = mom_mage),
              conf.int = TRUE)
  return(list(res,res_cell))
}) 

mom_setot_eaa <- lapply(mom_setot_eaa,getValues)
mom_setot_eaa <- do.call(rbind,mom_setot_eaa)

mom_awar_nr_eaa <- apply(mom_mage[mom_eaa_cols], 2, function(x){
  
  res <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + awar_nr, data = mom_mage),
              conf.int = TRUE)
  res_cell <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + PC1_cells + awar_nr, data = mom_mage),
              conf.int = TRUE)
  return(list(res,res_cell))
})

mom_awar_nr_eaa <- lapply(mom_awar_nr_eaa,getValues)
mom_awar_nr_eaa <- do.call(rbind,mom_awar_nr_eaa)

mom_achronic_eaa <- apply(mom_mage[mom_eaa_cols], 2, function(x){
  
  res <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + achronic, data = mom_mage),
              conf.int = TRUE)
  res_cell <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + PC1_cells + achronic, data = mom_mage),
              conf.int = TRUE)
  return(list(res,res_cell))
})

mom_achronic_eaa <- lapply(mom_achronic_eaa,getValues)
mom_achronic_eaa <- do.call(rbind,mom_achronic_eaa)

babies

baby_eaa_cols <- c("AgeAccelerationResidual",
                  "IEAA","EEAA")

baby_gtsum_eaa <- apply(baby_mage[baby_eaa_cols], 2, function(x){
  
  res <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + 
                      sex + ga_meth + gtsum, data = baby_mage),
              conf.int = TRUE)
  res_cell <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + 
                      sex + ga_meth + PC1_cells + PC2_cells + gtsum, data = baby_mage),
              conf.int = TRUE)
  return(list(res,res_cell))
})

baby_gtsum_eaa <- lapply(baby_gtsum_eaa,getValues)
baby_gtsum_eaa <- do.call(rbind,baby_gtsum_eaa)

baby_setot_eaa <- apply(baby_mage[baby_eaa_cols], 2, function(x){
  
  res <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + 
                      sex + ga_meth + setot, data = baby_mage),
              conf.int = TRUE)
  res_cell <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + 
                      sex + ga_meth + PC1_cells + PC2_cells + setot, data = baby_mage),
              conf.int = TRUE)
  return(list(res,res_cell))
}) 

baby_setot_eaa <- lapply(baby_setot_eaa,getValues)
baby_setot_eaa <- do.call(rbind,baby_setot_eaa)

baby_awar_nr_eaa <- apply(baby_mage[baby_eaa_cols], 2, function(x){
  
  res <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + 
                      sex + ga_meth + awar_nr, data = baby_mage),
              conf.int = TRUE)
  res_cell <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + 
                      sex + ga_meth + PC1_cells + PC2_cells + awar_nr, data = baby_mage),
              conf.int = TRUE)
  return(list(res,res_cell))
})

baby_awar_nr_eaa <- lapply(baby_awar_nr_eaa,getValues)
baby_awar_nr_eaa <- do.call(rbind,baby_awar_nr_eaa)

baby_achronic_eaa <- apply(baby_mage[baby_eaa_cols], 2, function(x){
  
  res <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + 
                      sex + ga_meth + achronic, data = baby_mage),
              conf.int = TRUE)
  res_cell <- tidy(lm(x ~ bmi + pcsec + palco + parous + cohort + 
                      sex + ga_meth + PC1_cells + PC2_cells + achronic, data = baby_mage),
                   conf.int = TRUE)
  return(list(res,res_cell))
})

baby_achronic_eaa <- lapply(baby_achronic_eaa,getValues)
baby_achronic_eaa <- do.call(rbind,baby_achronic_eaa)

Regression heatmap

Heatmap of results.

mom_gtsum_eaa <- mom_gtsum_eaa %>% 
  select(coef,pval) %>% 
  mutate(clock = c("Horvath","Horvath_cell",
                   "IEAA","IEAA_cell",
                   "EEAA","EEAA_cell",
                   "Telomere","Telomere_cell",
                   "PhenoAge","PhenoAge_cell",
                   "GrimAge","GrimAge_cell")) %>% 
  mutate(exposure = c("general trauma"))

mom_setot_eaa <- mom_setot_eaa %>% 
  select(coef,pval) %>% 
  mutate(clock = c("Horvath","Horvath_cell",
                   "IEAA","IEAA_cell",
                   "EEAA","EEAA_cell",
                   "Telomere","Telomere_cell",
                   "PhenoAge","PhenoAge_cell",
                   "GrimAge","GrimAge_cell")) %>% 
  mutate(exposure = c("sexual events"))

mom_awar_nr_eaa <- mom_awar_nr_eaa %>% 
  select(coef,pval) %>% 
  mutate(clock = c("Horvath","Horvath_cell",
                   "IEAA","IEAA_cell",
                   "EEAA","EEAA_cell",
                   "Telomere","Telomere_cell",
                   "PhenoAge","PhenoAge_cell",
                   "GrimAge","GrimAge_cell")) %>% 
  mutate(exposure = c("war stress"))

mom_achronic_eaa <- mom_achronic_eaa %>% 
  select(coef,pval) %>% 
  mutate(clock = c("Horvath","Horvath_cell",
                   "IEAA","IEAA_cell",
                   "EEAA","EEAA_cell",
                   "Telomere","Telomere_cell",
                   "PhenoAge","PhenoAge_cell",
                   "GrimAge","GrimAge_cell")) %>% 
  mutate(exposure = c("chronic stress"))

combo_mom <- rbind(mom_gtsum_eaa,mom_setot_eaa,mom_awar_nr_eaa,mom_achronic_eaa)

# Get a matrix of the coefficient values

mom_mat <- matrix(data = combo_mom$coef, byrow = FALSE, ncol = 4,nrow = 12)
rownames(mom_mat)<- c("Horvath","Horvath_cell",
                   "IEAA","IEAA_cell",
                   "EEAA","EEAA_cell",
                   "Telomere","Telomere_cell",
                   "PhenoAge","PhenoAge_cell",
                   "GrimAge","GrimAge_cell")
colnames(mom_mat) <- c("general trauma","sexual trauma",
                     "war trauma", "chronic stress")

# Subset for the tests we want:
mom_mat <- mom_mat[-(grep("_cell",rownames(mom_mat))),]

# Get a matrix of the p values

pval_mom <- matrix(data = combo_mom$pval, byrow = FALSE, ncol = 4,nrow = 12)
rownames(pval_mom)<- c("Horvath","Horvath_cell",
                   "IEAA","IEAA_cell",
                   "EEAA","EEAA_cell",
                   "Telomere","Telomere_cell",
                   "PhenoAge","PhenoAge_cell",
                   "GrimAge","GrimAge_cell")
colnames(pval_mom) <- c("general trauma","sexual trauma",
                     "war trauma", "chronic stress")

# Subset for the tests we want:
pval_mom <- pval_mom[-(grep("_cell",rownames(pval_mom))),]


print(mom_mat)
##          general trauma sexual trauma war trauma chronic stress
## Horvath     0.218788722    0.17113376 -0.2343213   -0.165975588
## IEAA        0.185559088    0.12639270 -0.1093203   -0.143285166
## EEAA        0.275574695    1.08649268 -0.1569762    0.207576419
## Telomere   -0.009541149   -0.02150548  0.0021936   -0.001063849
## PhenoAge    0.555257195    1.36186379 -0.5609807    0.185488241
## GrimAge     0.208747196    0.42459712 -0.1589974    0.064978777
print(pval_mom)
##          general trauma sexual trauma war trauma chronic stress
## Horvath       0.4968620   0.627564080  0.7000867      0.3492283
## IEAA          0.5228680   0.691159119  0.8420720      0.3702547
## EEAA          0.4286302   0.003864255  0.8114353      0.2787733
## Telomere      0.2730758   0.023092653  0.8940465      0.8248359
## PhenoAge      0.3244419   0.026238341  0.5984409      0.5506848
## GrimAge       0.1910909   0.014408464  0.5988246      0.4611069
col_fun = colorRamp2(c(-2, 0, 2), c("blue", "white", "red"))



mom_eaa_heatmap <- Heatmap(mom_mat, 
        name = "beta", 
        col = col_fun,
        column_title = "Maternal Stress and Epigenetic Age Acceleration",
        row_title = "Mothers",
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        column_names_rot = 0,
        column_names_centered = TRUE,
        cell_fun = function(j, i, x, y, w, h, fill) {
          if (pval_mom[i, j] < 0.001) {
          grid.text("***", x, y)
          } else if (pval_mom[i, j] < 0.01) {
          grid.text("**", x, y)
          } else if (pval_mom[i, j] < 0.05) {
          grid.text("*", x, y)
          }
          },
          row_gap = unit(0, "mm"), border = TRUE)
baby_gtsum_eaa <- baby_gtsum_eaa %>% 
  select(coef,pval) %>% 
  mutate(clock = c("Horvath","Horvath_cell",
                   "IEAA","IEAA_cell",
                   "EEAA","EEAA_cell")) %>% 
  mutate(exposure = c("general trauma"))

baby_setot_eaa <- baby_setot_eaa %>% 
  select(coef,pval) %>% 
  mutate(clock = c("Horvath","Horvath_cell",
                   "IEAA","IEAA_cell",
                   "EEAA","EEAA_cell")) %>% 
  mutate(exposure = c("sexual events"))

baby_awar_nr_eaa <- baby_awar_nr_eaa %>% 
  select(coef,pval) %>% 
  mutate(clock = c("Horvath","Horvath_cell",
                   "IEAA","IEAA_cell",
                   "EEAA","EEAA_cell")) %>% 
  mutate(exposure = c("war stress"))

baby_achronic_eaa <- baby_achronic_eaa %>% 
  select(coef,pval) %>% 
  mutate(clock = c("Horvath","Horvath_cell",
                   "IEAA","IEAA_cell",
                   "EEAA","EEAA_cell")) %>% 
  mutate(exposure = c("chronic stress"))

combo_baby <- rbind(baby_gtsum_eaa,baby_setot_eaa,baby_awar_nr_eaa,baby_achronic_eaa)

# Get a matrix of the coefficient values

baby_mat <- matrix(data = combo_baby$coef, byrow = FALSE, ncol = 4,nrow = 6)
rownames(baby_mat)<- c("Horvath","Horvath_cell",
                   "IEAA","IEAA_cell",
                   "EEAA","EEAA_cell")
colnames(baby_mat) <- c("general trauma","sexual trauma",
                     "war trauma", "chronic stress")

# Subset for the tests we want:
baby_mat <- baby_mat[-(grep("_cell",rownames(baby_mat))),]


# Get a matrix of the p values

pval_baby <- matrix(data = combo_baby$pval, byrow = FALSE, ncol = 4,nrow = 6)
rownames(pval_baby)<- c("Horvath","Horvath_cell",
                   "IEAA","IEAA_cell",
                   "EEAA","EEAA_cell")
colnames(pval_baby) <- c("general trauma","sexual trauma",
                     "war trauma", "chronic stress")

# Subset for the tests we want:
pval_baby <- pval_baby[-(grep("_cell",rownames(pval_baby))),]

print(baby_mat)
##         general trauma sexual trauma  war trauma chronic stress
## Horvath    -0.01825929  -0.007533342 -0.02851414   -0.004013180
## IEAA       -0.12706067   0.045669226 -0.17196381    0.006182543
## EEAA        0.69677656   0.297129153  1.12135175    0.072583576
print(pval_baby)
##         general trauma sexual trauma war trauma chronic stress
## Horvath     0.21230177     0.6276206 0.28782919      0.6021742
## IEAA        0.14631021     0.6229336 0.28379519      0.8932359
## EEAA        0.02400518     0.3668638 0.04776305      0.6569516
col_fun = colorRamp2(c(-2, 0, 2), c("blue", "white", "red"))

baby_eaa_heatmap <- Heatmap(baby_mat, 
        name = "beta", 
        col = col_fun,
        row_title = "Newborns",
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        column_names_rot = 0,
        column_names_centered = TRUE,
        cell_fun = function(j, i, x, y, w, h, fill) {
          if (pval_baby[i, j] < 0.001) {
          grid.text("***", x, y)
          } else if (pval_baby[i, j] < 0.01) {
          grid.text("**", x, y)
          } else if (pval_baby[i, j] < 0.05) {
          grid.text("*", x, y)
          }
          },
          row_gap = unit(0, "mm"), border = TRUE)

Analyses of cell type proportions

Should be using beta regression for this since the range is 0-1. First mothers.

# There is a problem with the NK column because several people have
# a value indistinguishable from zero. Add 0.000000001 to each measurement
# just to make the function run.

dfm$NK <- dfm$NK + 1e-09

mom_cell_cols <- c("Bcell","CD4T","CD8T","Mono","Neu","NK")



mom_gtsum_cell <- apply(dfm[mom_cell_cols], 2, function(x){
  
  res <- tidy(betareg(x ~ bmi + pcsec + palco + parous + cohort + age + ga_meth 
                 + gtsum, 
                 data = dfm), conf.int = TRUE)
  return(res)
})


mom_setot_cell <- apply(dfm[mom_cell_cols], 2, function(x){
  
  res <- tidy(betareg(x ~ bmi + pcsec + palco + parous + cohort + age + ga_meth 
                 + setot, 
                 data = dfm), conf.int = TRUE)
  return(res)
})


mom_awar_nr_cell <- apply(dfm[mom_cell_cols], 2, function(x){
  
  res <- tidy(betareg(x ~ bmi + pcsec + palco + parous + cohort + age + ga_meth 
                 + awar_nr, 
                 data = dfm), conf.int = TRUE)
  return(res)
})


mom_achronic_cell <- apply(dfm[mom_cell_cols], 2, function(x){
  
  res <- tidy(betareg(x ~ bmi + pcsec + palco + parous + cohort + age + ga_meth 
                 + achronic, 
                 data = dfm), conf.int = TRUE)
  return(res)
})

# For reasons I will never understand, the code here breaks when
# it is inside of a function, but works outside of a function.
# Just write out everything.

coef <- sapply(mom_gtsum_cell, function (x) x[[length(x[[1]])-1,"estimate"]])
std_error <- sapply(mom_gtsum_cell, function (x) x[[length(x[[1]])-1,"std.error"]])
test_stat <- sapply(mom_gtsum_cell, function (x) x[[length(x[[1]])-1,"statistic"]])
pval <- sapply(mom_gtsum_cell, function (x) x[[length(x[[1]])-1,"p.value"]])
conf_low <- sapply(mom_gtsum_cell, function (x) x[[length(x[[1]])-1,"conf.low"]])
conf_high <- sapply(mom_gtsum_cell, function (x) x[[length(x[[1]])-1,"conf.high"]])
df_mom_gtsum <- cbind.data.frame(coef,std_error,test_stat,pval,conf_low,conf_high)


coef <- sapply(mom_setot_cell, function (x) x[[length(x[[1]])-1,"estimate"]])
std_error <- sapply(mom_setot_cell, function (x) x[[length(x[[1]])-1,"std.error"]])
test_stat <- sapply(mom_setot_cell, function (x) x[[length(x[[1]])-1,"statistic"]])
pval <- sapply(mom_setot_cell, function (x) x[[length(x[[1]])-1,"p.value"]])
conf_low <- sapply(mom_setot_cell, function (x) x[[length(x[[1]])-1,"conf.low"]])
conf_high <- sapply(mom_setot_cell, function (x) x[[length(x[[1]])-1,"conf.high"]])
df_mom_setot <- cbind.data.frame(coef,std_error,test_stat,pval,conf_low,conf_high)


coef <- sapply(mom_awar_nr_cell, function (x) x[[length(x[[1]])-1,"estimate"]])
std_error <- sapply(mom_awar_nr_cell, function (x) x[[length(x[[1]])-1,"std.error"]])
test_stat <- sapply(mom_awar_nr_cell, function (x) x[[length(x[[1]])-1,"statistic"]])
pval <- sapply(mom_awar_nr_cell, function (x) x[[length(x[[1]])-1,"p.value"]])
conf_low <- sapply(mom_awar_nr_cell, function (x) x[[length(x[[1]])-1,"conf.low"]])
conf_high <- sapply(mom_awar_nr_cell, function (x) x[[length(x[[1]])-1,"conf.high"]])
df_mom_awar_nr <- cbind.data.frame(coef,std_error,test_stat,pval,conf_low,conf_high)


coef <- sapply(mom_achronic_cell, function (x) x[[length(x[[1]])-1,"estimate"]])
std_error <- sapply(mom_achronic_cell, function (x) x[[length(x[[1]])-1,"std.error"]])
test_stat <- sapply(mom_achronic_cell, function (x) x[[length(x[[1]])-1,"statistic"]])
pval <- sapply(mom_achronic_cell, function (x) x[[length(x[[1]])-1,"p.value"]])
conf_low <- sapply(mom_achronic_cell, function (x) x[[length(x[[1]])-1,"conf.low"]])
conf_high <- sapply(mom_achronic_cell, function (x) x[[length(x[[1]])-1,"conf.high"]])
df_mom_achronic <- cbind.data.frame(coef,std_error,test_stat,pval,conf_low,conf_high)





df_mom_gtsum <- df_mom_gtsum %>% 
  select(coef,pval) %>% 
  mutate(cell_type = c("Bcell","CD4T","CD8T","Mono","Neu","NK")) %>% 
  mutate(exposure = c("general trauma"))


df_mom_setot <- df_mom_setot %>% 
  select(coef,pval) %>% 
  mutate(cell_type = c("Bcell","CD4T","CD8T","Mono","Neu","NK")) %>% 
  mutate(exposure = c("sexual events"))


df_mom_awar_nr <- df_mom_awar_nr %>% 
  select(coef,pval) %>% 
  mutate(cell_type = c("Bcell","CD4T","CD8T","Mono","Neu","NK")) %>% 
  mutate(exposure = c("war stress"))


df_mom_achronic <- df_mom_achronic %>% 
  select(coef,pval) %>% 
  mutate(cell_type = c("Bcell","CD4T","CD8T","Mono","Neu","NK")) %>% 
  mutate(exposure = c("chronic stress"))


mom_combo_cell <- rbind(df_mom_gtsum,df_mom_setot,df_mom_awar_nr,df_mom_achronic)


# Get a matrix of the coefficient values

mom_mat_cell <- matrix(data = mom_combo_cell$coef, byrow = FALSE, ncol = 4,nrow = 6)
rownames(mom_mat_cell)<- c("Bcell","CD4T","CD8T","Mono","Neu","NK")
colnames(mom_mat_cell) <- c("general trauma","sexual events",
                     "war trauma", "chronic stress")

# Get a matrix of the p values


mom_mat_cell_pval <- matrix(data = mom_combo_cell$pval, byrow = FALSE, ncol = 4,nrow = 6)
rownames(mom_mat_cell_pval)<- c("Bcell","CD4T","CD8T","Mono","Neu","NK")
colnames(mom_mat_cell_pval) <- c("general trauma","sexual events",
                     "war trauma", "chronic stress")


col_fun2 = colorRamp2(c(-0.1,0, 0.1), c("blue","white", "red"))

mom_cell_heatmap <- Heatmap(mom_mat_cell, 
        name = "beta", 
        col = col_fun2,
        column_title = "Maternal Stress - Results in Mothers",
        row_title = "Immune Cell types",
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        cell_fun = function(j, i, x, y, w, h, fill) {
          if (mom_mat_cell_pval[i, j] < 0.001) {
          grid.text("***", x, y)
          } else if (mom_mat_cell_pval[i, j] < 0.01) {
          grid.text("**", x, y)
          } else if (mom_mat_cell_pval[i, j] < 0.05) {
          grid.text("*", x, y)
          }
          },
          row_gap = unit(0, "mm"), border = TRUE)
# There is a problem with the NK column because several people have
# a value indistinguishable from zero. Add 0.000000001 to each measurement
# just to make the function run.

dfb$Bcell <- dfb$Bcell + 1e-09
dfb$CD4T <- dfb$CD4T + 1e-09
dfb$CD8T <- dfb$CD8T + 1e-09
dfb$NK <- dfb$NK + 1e-09


baby_cell_cols <- c("Bcell","CD4T","CD8T","Gran","Mono","NK","nRBC")



baby_gtsum_cell <- apply(dfb[baby_cell_cols], 2, function(x){
  
  res <- tidy(betareg(x ~ bmi + pcsec + palco + parous + cohort + age + ga_meth 
                 + sex + gtsum, 
                 data = dfb), conf.int = TRUE)
  return(res)
})


baby_setot_cell <- apply(dfb[baby_cell_cols], 2, function(x){
  
  res <- tidy(betareg(x ~ bmi + pcsec + palco + parous + cohort + age + ga_meth 
                 + sex + setot, 
                 data = dfb), conf.int = TRUE)
  return(res)
})


baby_awar_nr_cell <- apply(dfb[baby_cell_cols], 2, function(x){
  
  res <- tidy(betareg(x ~ bmi + pcsec + palco + parous + cohort + age + ga_meth 
                 + sex + awar_nr, 
                 data = dfb), conf.int = TRUE)
  return(res)
})


baby_achronic_cell <- apply(dfb[baby_cell_cols], 2, function(x){
  
  res <- tidy(betareg(x ~ bmi + pcsec + palco + parous + cohort + age + ga_meth 
                 + sex + achronic, 
                 data = dfb), conf.int = TRUE)
  return(res)
})

# For reasons I will never understand, the code here breaks when
# it is inside of a function, but works outside of a function.
# Just write out everything.

coef <- sapply(baby_gtsum_cell, function (x) x[[length(x[[1]])-1,"estimate"]])
std_error <- sapply(baby_gtsum_cell, function (x) x[[length(x[[1]])-1,"std.error"]])
test_stat <- sapply(baby_gtsum_cell, function (x) x[[length(x[[1]])-1,"statistic"]])
pval <- sapply(baby_gtsum_cell, function (x) x[[length(x[[1]])-1,"p.value"]])
conf_low <- sapply(baby_gtsum_cell, function (x) x[[length(x[[1]])-1,"conf.low"]])
conf_high <- sapply(baby_gtsum_cell, function (x) x[[length(x[[1]])-1,"conf.high"]])
df_baby_gtsum <- cbind.data.frame(coef,std_error,test_stat,pval,conf_low,conf_high)


coef <- sapply(baby_setot_cell, function (x) x[[length(x[[1]])-1,"estimate"]])
std_error <- sapply(baby_setot_cell, function (x) x[[length(x[[1]])-1,"std.error"]])
test_stat <- sapply(baby_setot_cell, function (x) x[[length(x[[1]])-1,"statistic"]])
pval <- sapply(baby_setot_cell, function (x) x[[length(x[[1]])-1,"p.value"]])
conf_low <- sapply(baby_setot_cell, function (x) x[[length(x[[1]])-1,"conf.low"]])
conf_high <- sapply(baby_setot_cell, function (x) x[[length(x[[1]])-1,"conf.high"]])
df_baby_setot <- cbind.data.frame(coef,std_error,test_stat,pval,conf_low,conf_high)


coef <- sapply(baby_awar_nr_cell, function (x) x[[length(x[[1]])-1,"estimate"]])
std_error <- sapply(baby_awar_nr_cell, function (x) x[[length(x[[1]])-1,"std.error"]])
test_stat <- sapply(baby_awar_nr_cell, function (x) x[[length(x[[1]])-1,"statistic"]])
pval <- sapply(baby_awar_nr_cell, function (x) x[[length(x[[1]])-1,"p.value"]])
conf_low <- sapply(baby_awar_nr_cell, function (x) x[[length(x[[1]])-1,"conf.low"]])
conf_high <- sapply(baby_awar_nr_cell, function (x) x[[length(x[[1]])-1,"conf.high"]])
df_baby_awar_nr <- cbind.data.frame(coef,std_error,test_stat,pval,conf_low,conf_high)


coef <- sapply(baby_achronic_cell, function (x) x[[length(x[[1]])-1,"estimate"]])
std_error <- sapply(baby_achronic_cell, function (x) x[[length(x[[1]])-1,"std.error"]])
test_stat <- sapply(baby_achronic_cell, function (x) x[[length(x[[1]])-1,"statistic"]])
pval <- sapply(baby_achronic_cell, function (x) x[[length(x[[1]])-1,"p.value"]])
conf_low <- sapply(baby_achronic_cell, function (x) x[[length(x[[1]])-1,"conf.low"]])
conf_high <- sapply(baby_achronic_cell, function (x) x[[length(x[[1]])-1,"conf.high"]])
df_baby_achronic <- cbind.data.frame(coef,std_error,test_stat,pval,conf_low,conf_high)





df_baby_gtsum <- df_baby_gtsum %>% 
  select(coef,pval) %>% 
  mutate(cell_type = c("Bcell","CD4T","CD8T","Gran","Mono","NK","nRBC")) %>% 
  mutate(exposure = c("general trauma"))


df_baby_setot <- df_baby_setot %>% 
  select(coef,pval) %>% 
  mutate(cell_type = c("Bcell","CD4T","CD8T","Gran","Mono","NK","nRBC")) %>% 
  mutate(exposure = c("sexual events"))


df_baby_awar_nr <- df_baby_awar_nr %>% 
  select(coef,pval) %>% 
  mutate(cell_type = c("Bcell","CD4T","CD8T","Gran","Mono","NK","nRBC")) %>% 
  mutate(exposure = c("war stress"))


df_baby_achronic <- df_baby_achronic %>% 
  select(coef,pval) %>% 
  mutate(cell_type = c("Bcell","CD4T","CD8T","Gran","Mono","NK","nRBC")) %>% 
  mutate(exposure = c("chronic stress"))


baby_combo_cell <- rbind(df_baby_gtsum,df_baby_setot,df_baby_awar_nr,df_baby_achronic)


# Get a matrix of the coefficient values

baby_mat_cell <- matrix(data = baby_combo_cell$coef, byrow = FALSE, ncol = 4,nrow = 7)
rownames(baby_mat_cell)<- c("Bcell","CD4T","CD8T","Gran","Mono","NK","nRBC")
colnames(baby_mat_cell) <- c("general trauma","sexual events",
                     "war trauma", "chronic stress")

# Get a matrix of the p values


baby_mat_cell_pval <- matrix(data = baby_combo_cell$pval, byrow = FALSE, ncol = 4,nrow = 7)
rownames(baby_mat_cell_pval)<- c("Bcell","CD4T","CD8T","Gran","Mono","NK","nRBC")
colnames(baby_mat_cell_pval) <- c("general trauma","sexual events",
                     "war trauma", "chronic stress")


col_fun2 = colorRamp2(c(-0.1,0, 0.1), c("blue","white", "red"))

baby_cell_heatmap <- Heatmap(baby_mat_cell, 
        name = "beta", 
        col = col_fun2,
        column_title = "Maternal Stress - Results in Babies",
        row_title = "Immune Cell types",
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        cell_fun = function(j, i, x, y, w, h, fill) {
          if (baby_mat_cell_pval[i, j] < 0.001) {
          grid.text("***", x, y)
          } else if (baby_mat_cell_pval[i, j] < 0.01) {
          grid.text("**", x, y)
          } else if (baby_mat_cell_pval[i, j] < 0.05) {
          grid.text("*", x, y)
          }
          },
          row_gap = unit(0, "mm"), border = TRUE)

Final Report

Table 1 Mothers descriptives

# Make a pvalue function for table1:

pvalue_table <- function(x, ...) {
  x <- x[-length(x)]  # Remove "overall" group
  # Construct vectors of data y, and groups (strata) g
  y <- unlist(x)
  g <- factor(rep(1:length(x), times=sapply(x, length)))
  if (is.numeric(y)) {
    # For numeric variables, perform an ANOVA
    p <- summary(aov(y ~ g))[[1]][["Pr(>F)"]][1]
  } else {
    # For categorical variables, perform a chi-squared test of independence
    p <- fisher.test(table(y, g))$p.value
  }
  # Format the p-value, using an HTML entity for the less-than sign.
  # The initial empty string places the output on the line below the variable label.
  c("", sub("<", "&lt;", format.pval(p, digits=3, eps=0.001)))
}



dfm$Delivery_Mode <- factor(dfm$pcsec, levels = c(0,1),
                            labels = c("vaginal","caesarean section"))

dfm$Alcohol <- factor(dfm$palco, levels = c(0,1),
                            labels = c("No","Yes"))

dfm$Parity <- factor(dfm$parous, levels = c(0,1),
                     labels = c("Multigravida","Primigravida"))

dfm$Gestational_Age <- dfm$ga_meth/7

dfm$General_Trauma <- dfm$gtsum
dfm$Sexual_Events <- dfm$setot
dfm$War_Stress <- dfm$awar_nr
dfm$Chronic_Stress <- dfm$achronic

dfm$Cohort <- factor(dfm$cohort, levels = c("C","SV"),
                     labels = c("General Maternity Ward",
                                "Sexual Violence Ward"))


table1(~ age + bmi + Delivery_Mode + Alcohol + Parity + Gestational_Age +
         General_Trauma + Sexual_Events + War_Stress + Chronic_Stress
       | Cohort, data=dfm, overall="Total",
    extra.col=list(`p`=pvalue_table), extra.col.pos=3)
General Maternity Ward
(N=86)
Sexual Violence Ward
(N=65)
p Total
(N=151)
age
Mean (SD) 26.2 (6.07) 17.3 (3.99) <0.001 22.5 (6.86)
Median [Min, Max] 25.0 [14.0, 42.0] 17.0 [12.0, 33.0] 21.0 [12.0, 42.0]
Missing 0 (0%) 2 (3.1%) 2 (1.3%)
bmi
Mean (SD) 28.2 (4.15) 25.8 (2.78) <0.001 27.2 (3.81)
Median [Min, Max] 27.1 [21.4, 39.6] 25.7 [21.0, 33.7] 26.3 [21.0, 39.6]
Missing 0 (0%) 1 (1.5%) 1 (0.7%)
Delivery_Mode
vaginal 70 (81.4%) 48 (73.8%) 0.321 118 (78.1%)
caesarean section 16 (18.6%) 17 (26.2%) 33 (21.9%)
Alcohol
No 71 (82.6%) 56 (86.2%) 0.353 127 (84.1%)
Yes 15 (17.4%) 7 (10.8%) 22 (14.6%)
Missing 0 (0%) 2 (3.1%) 2 (1.3%)
Parity
Multigravida 59 (68.6%) 8 (12.3%) <0.001 67 (44.4%)
Primigravida 24 (27.9%) 55 (84.6%) 79 (52.3%)
Missing 3 (3.5%) 2 (3.1%) 5 (3.3%)
Gestational_Age
Mean (SD) 40.1 (1.02) 39.5 (1.43) 0.008 39.9 (1.24)
Median [Min, Max] 40.2 [37.1, 42.6] 39.8 [33.2, 41.5] 40.1 [33.2, 42.6]
Missing 3 (3.5%) 2 (3.1%) 5 (3.3%)
General_Trauma
Mean (SD) 2.45 (1.84) 2.22 (1.45) 0.391 2.35 (1.68)
Median [Min, Max] 2.00 [0, 8.00] 2.00 [0, 8.00] 2.00 [0, 8.00]
Sexual_Events
Mean (SD) 0.942 (1.63) 1.72 (1.60) 0.004 1.28 (1.66)
Median [Min, Max] 0 [0, 6.00] 1.00 [0, 6.00] 1.00 [0, 6.00]
War_Stress
Mean (SD) 1.05 (0.893) 0.954 (0.959) 0.542 1.01 (0.920)
Median [Min, Max] 1.00 [0, 5.00] 1.00 [0, 4.00] 1.00 [0, 5.00]
Chronic_Stress
Mean (SD) 4.49 (3.54) 8.43 (2.87) <0.001 6.19 (3.80)
Median [Min, Max] 3.00 [0, 14.0] 9.00 [0, 14.0] 6.00 [0, 14.0]

Make a descriptive statistics table that is inclusive of the participants included in the epigenetic age analyses.

mom_mage$Delivery_Mode <- factor(mom_mage$pcsec, levels = c(0,1),
                            labels = c("vaginal","caesarean section"))

mom_mage$Alcohol <- factor(mom_mage$palco, levels = c(0,1),
                            labels = c("No","Yes"))

mom_mage$Parity <- factor(mom_mage$parous, levels = c(0,1),
                     labels = c("Multigravida","Primigravida"))

mom_mage$Gestational_Age <- mom_mage$ga_meth/7

mom_mage$General_Trauma <- mom_mage$gtsum
mom_mage$Sexual_Events <- mom_mage$setot
mom_mage$War_Stress <- mom_mage$awar_nr
mom_mage$Chronic_Stress <- mom_mage$achronic

mom_mage$Cohort <- factor(mom_mage$cohort, levels = c("C","SV"),
                     labels = c("General Maternity Ward",
                                "Sexual Violence Ward"))

table1(~ age + bmi + Delivery_Mode + Alcohol + Parity + Gestational_Age +
         General_Trauma + Sexual_Events + War_Stress + Chronic_Stress
       | Cohort, data=mom_mage, overall="Total",
    extra.col=list(`p`=pvalue_table), extra.col.pos=3)
General Maternity Ward
(N=90)
Sexual Violence Ward
(N=65)
p Total
(N=155)
age
Mean (SD) 26.3 (5.95) 17.3 (3.99) <0.001 22.6 (6.83)
Median [Min, Max] 26.0 [14.0, 42.0] 17.0 [12.0, 33.0] 21.0 [12.0, 42.0]
Missing 0 (0%) 2 (3.1%) 2 (1.3%)
bmi
Mean (SD) 28.1 (4.08) 25.8 (2.78) <0.001 27.2 (3.76)
Median [Min, Max] 27.0 [21.4, 39.6] 25.7 [21.0, 33.7] 26.3 [21.0, 39.6]
Missing 0 (0%) 1 (1.5%) 1 (0.6%)
Delivery_Mode
vaginal 73 (81.1%) 48 (73.8%) 0.327 121 (78.1%)
caesarean section 17 (18.9%) 17 (26.2%) 34 (21.9%)
Alcohol
No 75 (83.3%) 56 (86.2%) 0.362 131 (84.5%)
Yes 15 (16.7%) 7 (10.8%) 22 (14.2%)
Missing 0 (0%) 2 (3.1%) 2 (1.3%)
Parity
Multigravida 63 (70.0%) 8 (12.3%) <0.001 71 (45.8%)
Primigravida 24 (26.7%) 55 (84.6%) 79 (51.0%)
Missing 3 (3.3%) 2 (3.1%) 5 (3.2%)
Gestational_Age
Mean (SD) 40.1 (1.00) 39.5 (1.43) 0.006 39.9 (1.23)
Median [Min, Max] 40.3 [37.1, 42.6] 39.8 [33.2, 41.5] 40.1 [33.2, 42.6]
Missing 4 (4.4%) 2 (3.1%) 6 (3.9%)
General_Trauma
Mean (SD) 2.46 (1.88) 2.22 (1.45) 0.391 2.35 (1.71)
Median [Min, Max] 2.00 [0, 8.00] 2.00 [0, 8.00] 2.00 [0, 8.00]
Sexual_Events
Mean (SD) 0.956 (1.63) 1.72 (1.60) 0.004 1.28 (1.66)
Median [Min, Max] 0 [0, 6.00] 1.00 [0, 6.00] 1.00 [0, 6.00]
War_Stress
Mean (SD) 1.04 (0.886) 0.954 (0.959) 0.545 1.01 (0.915)
Median [Min, Max] 1.00 [0, 5.00] 1.00 [0, 4.00] 1.00 [0, 5.00]
Chronic_Stress
Mean (SD) 4.63 (3.60) 8.43 (2.87) <0.001 6.23 (3.80)
Median [Min, Max] 3.00 [0, 14.0] 9.00 [0, 14.0] 6.00 [0, 14.0]

Table 2 Babies descriptives

dfb$Delivery_Mode <- factor(dfb$pcsec, levels = c(0,1),
                            labels = c("vaginal","caesarean section"))

dfb$Alcohol <- factor(dfb$palco, levels = c(0,1),
                            labels = c("No","Yes"))

dfb$Parity <- factor(dfb$parous, levels = c(0,1),
                     labels = c("Multigravida","Primigravida"))

dfb$Gestational_Age <- dfb$ga_meth/7

dfb$General_Trauma <- dfb$gtsum
dfb$Sexual_Events <- dfb$setot
dfb$War_Stress <- dfb$awar_nr
dfb$Chronic_Stress <- dfb$achronic

dfb$Cohort <- factor(dfb$cohort, levels = c("C","SV"),
                     labels = c("General Maternity Ward",
                                "Sexual Violence Ward"))
dfb$Sex <- factor(dfb$sex, levels = c("F","M"),
                  labels = c("Female","Male"))

table1(~ Sex + bmi + Delivery_Mode + Alcohol + Parity + Gestational_Age +
         General_Trauma + Sexual_Events + War_Stress + Chronic_Stress
       | Cohort, data=dfb, overall="Total",
    extra.col=list(`p`=pvalue_table), extra.col.pos=3)
General Maternity Ward
(N=86)
Sexual Violence Ward
(N=65)
p Total
(N=151)
Sex
Female 43 (50.0%) 33 (50.8%) 1 76 (50.3%)
Male 43 (50.0%) 32 (49.2%) 75 (49.7%)
bmi
Mean (SD) 28.2 (4.17) 25.9 (2.80) <0.001 27.2 (3.81)
Median [Min, Max] 27.0 [21.4, 39.6] 25.8 [21.0, 33.7] 26.4 [21.0, 39.6]
Missing 0 (0%) 1 (1.5%) 1 (0.7%)
Delivery_Mode
vaginal 69 (80.2%) 46 (70.8%) 0.184 115 (76.2%)
caesarean section 17 (19.8%) 19 (29.2%) 36 (23.8%)
Alcohol
No 71 (82.6%) 55 (84.6%) 0.355 126 (83.4%)
Yes 15 (17.4%) 7 (10.8%) 22 (14.6%)
Missing 0 (0%) 3 (4.6%) 3 (2.0%)
Parity
Multigravida 62 (72.1%) 8 (12.3%) <0.001 70 (46.4%)
Primigravida 22 (25.6%) 54 (83.1%) 76 (50.3%)
Missing 2 (2.3%) 3 (4.6%) 5 (3.3%)
Gestational_Age
Mean (SD) 40.1 (1.00) 39.6 (1.44) 0.011 39.9 (1.23)
Median [Min, Max] 40.3 [37.1, 42.6] 39.8 [33.2, 41.5] 40.1 [33.2, 42.6]
General_Trauma
Mean (SD) 2.41 (1.82) 2.22 (1.46) 0.488 2.32 (1.68)
Median [Min, Max] 2.00 [0, 8.00] 2.00 [0, 8.00] 2.00 [0, 8.00]
Sexual_Events
Mean (SD) 0.919 (1.65) 1.69 (1.60) 0.004 1.25 (1.67)
Median [Min, Max] 0 [0, 6.00] 1.00 [0, 6.00] 1.00 [0, 6.00]
War_Stress
Mean (SD) 1.06 (0.886) 0.969 (0.968) 0.558 1.02 (0.920)
Median [Min, Max] 1.00 [0, 5.00] 1.00 [0, 4.00] 1.00 [0, 5.00]
Chronic_Stress
Mean (SD) 4.51 (3.63) 8.40 (3.10) <0.001 6.19 (3.91)
Median [Min, Max] 3.00 [0, 14.0] 9.00 [0, 14.0] 6.00 [0, 14.0]

Make a descriptive statistics table that is inclusive of the newborns included in the epigenetic age analyses.

baby_mage$Delivery_Mode <- factor(baby_mage$pcsec, levels = c(0,1),
                            labels = c("vaginal","caesarean section"))

baby_mage$Alcohol <- factor(baby_mage$palco, levels = c(0,1),
                            labels = c("No","Yes"))

baby_mage$Parity <- factor(baby_mage$parous, levels = c(0,1),
                     labels = c("Multigravida","Primigravida"))

baby_mage$Gestational_Age <- baby_mage$ga_meth/7

baby_mage$General_Trauma <- baby_mage$gtsum
baby_mage$Sexual_Events <- baby_mage$setot
baby_mage$War_Stress <- baby_mage$awar_nr
baby_mage$Chronic_Stress <- baby_mage$achronic

baby_mage$Cohort <- factor(baby_mage$cohort, levels = c("C","SV"),
                     labels = c("General Maternity Ward",
                                "Sexual Violence Ward"))
baby_mage$Sex <- factor(baby_mage$sex, levels = c("F","M"),
                  labels = c("Female","Male"))

table1(~ Sex + age + bmi + Delivery_Mode + Alcohol + Parity + Gestational_Age +
         General_Trauma + Sexual_Events + War_Stress + Chronic_Stress
       | Cohort, data=baby_mage, overall="Total",
    extra.col=list(`p`=pvalue_table), extra.col.pos=3)
General Maternity Ward
(N=90)
Sexual Violence Ward
(N=65)
p Total
(N=155)
Sex
Female 43 (47.8%) 33 (50.8%) 0.747 76 (49.0%)
Male 47 (52.2%) 32 (49.2%) 79 (51.0%)
age
Mean (SD) 26.2 (5.92) 17.5 (3.97) <0.001 22.6 (6.75)
Median [Min, Max] 26.0 [14.0, 42.0] 17.0 [13.0, 33.0] 21.0 [13.0, 42.0]
Missing 0 (0%) 3 (4.6%) 3 (1.9%)
bmi
Mean (SD) 28.1 (4.09) 25.9 (2.80) <0.001 27.2 (3.76)
Median [Min, Max] 27.0 [21.4, 39.6] 25.8 [21.0, 33.7] 26.4 [21.0, 39.6]
Missing 0 (0%) 1 (1.5%) 1 (0.6%)
Delivery_Mode
vaginal 73 (81.1%) 46 (70.8%) 0.177 119 (76.8%)
caesarean section 17 (18.9%) 19 (29.2%) 36 (23.2%)
Alcohol
No 75 (83.3%) 55 (84.6%) 0.483 130 (83.9%)
Yes 15 (16.7%) 7 (10.8%) 22 (14.2%)
Missing 0 (0%) 3 (4.6%) 3 (1.9%)
Parity
Multigravida 63 (70.0%) 8 (12.3%) <0.001 71 (45.8%)
Primigravida 24 (26.7%) 54 (83.1%) 78 (50.3%)
Missing 3 (3.3%) 3 (4.6%) 6 (3.9%)
Gestational_Age
Mean (SD) 40.1 (1.00) 39.6 (1.44) 0.011 39.9 (1.23)
Median [Min, Max] 40.3 [37.1, 42.6] 39.8 [33.2, 41.5] 40.1 [33.2, 42.6]
Missing 4 (4.4%) 0 (0%) 4 (2.6%)
General_Trauma
Mean (SD) 2.42 (1.90) 2.22 (1.46) 0.464 2.34 (1.73)
Median [Min, Max] 2.00 [0, 8.00] 2.00 [0, 8.00] 2.00 [0, 8.00]
Sexual_Events
Mean (SD) 0.911 (1.63) 1.69 (1.60) 0.003 1.24 (1.66)
Median [Min, Max] 0 [0, 6.00] 1.00 [0, 6.00] 1.00 [0, 6.00]
War_Stress
Mean (SD) 1.04 (0.886) 0.969 (0.968) 0.617 1.01 (0.919)
Median [Min, Max] 1.00 [0, 5.00] 1.00 [0, 4.00] 1.00 [0, 5.00]
Chronic_Stress
Mean (SD) 4.57 (3.58) 8.40 (3.10) <0.001 6.17 (3.87)
Median [Min, Max] 3.00 [0, 14.0] 9.00 [0, 14.0] 6.00 [0, 14.0]

Table 3 ewas hits mothers

mom_sig_gtsum <- mom_gtsum_man2 %>% 
  filter(pval < (0.05/nrow(mom_gtsum_man2))) %>% 
  select(probe,coef,pval) %>% 
  mutate(exposure = c("general_trauma")) %>% 
  mutate(generation = c("mother"))

mom_sig_setot <- mom_setot_man2 %>% 
  filter(pval < (0.05/nrow(mom_setot_man2))) %>% 
  select(probe,coef,pval) %>% 
  mutate(exposure = c("sexual_events"))%>% 
  mutate(generation = c("mother"))

mom_sig_awar_nr <- mom_awar_nr_man2 %>% 
  filter(pval < (0.05/nrow(mom_awar_nr_man2))) %>% 
  select(probe,coef,pval) %>% 
  mutate(exposure = c("war_stress"))%>% 
  mutate(generation = c("mother"))

mom_sig_achronic <- mom_achronic_man2 %>% 
  filter(pval < (0.05/nrow(mom_achronic_man2))) %>% 
  select(probe,coef,pval) %>% 
  mutate(exposure = c("chronic_stress"))%>% 
  mutate(generation = c("mother"))

baby_sig_gtsum <- baby_gtsum_man2 %>% 
  filter(pval < (0.05/nrow(baby_gtsum_man2))) %>% 
  select(probe,coef,pval) %>% 
  mutate(exposure = c("general_trauma")) %>% 
  mutate(generation = c("baby"))

baby_sig_setot <- baby_setot_man2 %>% 
  filter(pval < (0.05/nrow(baby_setot_man2))) %>% 
  select(probe,coef,pval) %>% 
  mutate(exposure = c("sexual_events"))%>% 
  mutate(generation = c("baby"))

baby_sig_awar_nr <- baby_awar_nr_man2 %>% 
  filter(pval < (0.05/nrow(baby_awar_nr_man2))) %>% 
  select(probe,coef,pval) %>% 
  mutate(exposure = c("war_stress"))%>% 
  mutate(generation = c("baby"))

baby_sig_achronic <- baby_achronic_man2 %>% 
  filter(pval < (0.05/nrow(baby_achronic_man2))) %>% 
  select(probe,coef,pval) %>% 
  mutate(exposure = c("chronic_stress")) %>% 
  mutate(generation = c("baby"))

mom_all_sig <- bind_rows(mom_sig_gtsum,
                         mom_sig_setot,
                         mom_sig_awar_nr,
                         mom_sig_achronic)

baby_all_sig <- bind_rows(baby_sig_gtsum,
                         baby_sig_setot,
                         baby_sig_awar_nr,
                         baby_sig_achronic)

library(scales)

mom_all_sig2 <- zhou %>% 
  select(probeID,gene) %>% 
  right_join(mom_all_sig, by = c("probeID" = "probe")) %>% 
  mutate(coefficient = round(coef, digits = 3)) %>% 
  mutate(p_value = scientific(pval, digits = 3)) %>% 
  left_join(zhou2, by = c("probeID")) %>% 
  left_join(illumina2, by = c("probeID")) %>% 
  select(probeID,coefficient,p_value,exposure,
         generation,gene,CGIposition,distToTSS,gene_context)


baby_all_sig2 <- zhou %>% 
  select(probeID,gene) %>% 
  right_join(baby_all_sig, by = c("probeID" = "probe")) %>% 
  mutate(coefficient = round(coef, digits = 3)) %>% 
  mutate(p_value = scientific(pval, digits = 3)) %>% 
  left_join(zhou2, by = c("probeID")) %>% 
  left_join(illumina2, by = c("probeID")) %>% 
  select(probeID,coefficient,p_value,exposure,
         generation,gene,CGIposition,distToTSS,gene_context) 


datatable(mom_all_sig2[c("exposure","probeID","coefficient","p_value","gene","CGIposition",
                         "gene_context")],
          filter = "top", rownames = FALSE, width = '100%',
          options = list(scrollX = TRUE),
          caption = htmltools::tags$caption(style = 'caption-side: top; text-align: center; color:black; font-size:200% ;','All significant hits in mothers across four maternal stress EWAS'))
# Table of significant hits for mothers

mom_all_sig2 %>% 
select(exposure,probeID,coefficient,p_value,gene,CGIposition,gene_context) %>% 
arrange(exposure) %>% 
kbl() %>% 
  kable_styling(full_width = FALSE) %>% 
  kable_material() %>% 
  pack_rows(group_label = "General Trauma",1,4) %>% 
  pack_rows(group_label = "Sexual Events",5,13) %>% 
  pack_rows(group_label = "War Stress",14,15)
exposure probeID coefficient p_value gene CGIposition gene_context
General Trauma
general_trauma cg11408019 0.003 1.10e-10 NA Island
general_trauma cg14519777 0.006 6.87e-08 CTA-339C12.1;CUX1 NA Body;Body;Body
general_trauma cg14282695 0.009 1.48e-08 SAMD4A NA Body;Body
general_trauma cg16543391 -0.001 6.25e-08 EML2;MIR330 N_Shelf TSS1500;TSS200;TSS200;Body;Body
Sexual Events
sexual_events cg21219607 -0.002 5.50e-08 PARP15 NA TSS1500;TSS1500;Body;Body
sexual_events cg06308131 -0.004 9.71e-10 MUC4 S_Shore Body;Body;Body
sexual_events cg04358942 -0.001 2.86e-08 RP11-64D24.2 NA
sexual_events cg23527517 -0.008 1.23e-08 OTOP3 NA Body
sexual_events cg14859642 0.007 6.99e-08 NA NA
sexual_events cg00489624 -0.003 1.10e-08 DLGAP4 NA 5’UTR
sexual_events cg24308336 -0.007 3.79e-08 ARHGAP40 N_Shelf Body
sexual_events cg16765764 -0.002 9.46e-09 DHX35 NA Body;Body;Body
sexual_events cg10897169 -0.003 6.81e-08 BCAS4 NA Body;Body;Body
War Stress
war_stress cg13740840 -0.002 5.99e-08 KIF15;MIR564;TMEM42 Island TSS1500;TSS200
war_stress cg26486174 0.013 1.61e-08 NA Island

Table 4 ewas hits babies

datatable(baby_all_sig2[c("exposure",
                          "probeID",
                          "coefficient",
                          "p_value",
                          "gene",
                          "CGIposition",
                          "gene_context")],
          filter = "top", rownames = FALSE, width = '100%',
          options = list(scrollX = TRUE),
          caption = htmltools::tags$caption(style = 'caption-side: top; text-align: center; color:black; font-size:200% ;','All significant hits in babies across four maternal stress EWAS'))
# Table of significant hits for mothers

baby_all_sig2 %>% 
select(exposure,probeID,coefficient,p_value,gene,CGIposition,gene_context) %>% 
arrange(exposure) %>% 
kbl() %>% 
  kable_styling(full_width = FALSE) %>% 
  kable_material() %>% 
  pack_rows(group_label = "General Trauma",1,2) %>% 
  pack_rows(group_label = "Sexual Events",3,8) %>% 
  pack_rows(group_label = "War Stress",9,11)
exposure probeID coefficient p_value gene CGIposition gene_context
General Trauma
general_trauma cg24590750 0.000 4.25e-08 TBPL1 Island TSS1500;TSS200
general_trauma cg10783680 0.000 6.23e-08 EXOC7 Island 5’UTR;1stExon;5’UTR;1stExon;5’UTR;1stExon;5’UTR;1stExon;1stExon;5’UTR;TSS200
Sexual Events
sexual_events cg10338475 0.006 4.74e-11 CTC-436P18.1;SMIM15 S_Shore TSS1500;Body
sexual_events cg11386818 0.002 3.97e-09 SYNCRIP Island TSS1500;5’UTR;5’UTR;5’UTR;TSS1500;5’UTR
sexual_events cg02176407 0.003 5.69e-08 POU3F2 Island 1stExon
sexual_events cg20807701 0.003 2.80e-11 DNAJB9;PNPLA8;THAP5 N_Shore Body;TSS1500;5’UTR;1stExon
sexual_events cg09631059 -0.005 8.48e-09 RP11-1391J7.1;TSPAN4 N_Shore 5’UTR;Body;Body;Body;Body;Body;Body
sexual_events cg06873316 0.001 4.03e-08 NELL1 Island TSS1500;TSS1500
War Stress
war_stress cg08985979 0.006 2.10e-08 AC108142.1 S_Shore Body
war_stress cg21172322 0.018 5.88e-10 BCAT1;RP11-662I13.3 N_Shore Body
war_stress cg00741900 0.007 2.98e-08 DIO3;DIO3OS;MIR1247 Island 5’UTR;1stExon;TSS1500

Figure 1 manhattan plots

library(patchwork)
# Print all manhattan plots
mom_gtsum_man + mom_setot_man + mom_awar_nr_man +
  mom_achronic_man + baby_gtsum_man + baby_setot_man + baby_awar_nr_man +
  baby_achronic_man + plot_layout(ncol = 2) + plot_annotation(tag_levels = "A")

overlap of top hits

within_stress_venn

between_stress_venn

top hits predicted values mothers

ggarrange(mom_gtsum_pred_plots$cg11408019,
          mom_gtsum_pred_plots$cg14519777,
          mom_gtsum_pred_plots$cg14282695,
          mom_gtsum_pred_plots$cg16543391,
          ncol = 2, nrow = 2)

ggarrange(mom_setot_pred_plots$cg21219607,
          mom_setot_pred_plots$cg06308131,
          mom_setot_pred_plots$cg04358942,
          mom_setot_pred_plots$cg23527517,
          mom_setot_pred_plots$cg14859642,
          mom_setot_pred_plots$cg00489624,
          mom_setot_pred_plots$cg24308336,
          mom_setot_pred_plots$cg16765764,
          mom_setot_pred_plots$cg10897169,
          ncol = 2, nrow = 5)

ggarrange(mom_awar_nr_pred_plots$cg13740840,
          mom_awar_nr_pred_plots$cg26486174,
          ncol = 2, nrow = 1)

Covariates controlled for in all analyses of mothers include: Body mass index, delivery mode (vaginal vs c-section), parity (yes vs no), alcohol use in pregnancy, age of the mother, gestational age of the baby, first principal component of cell type (which explains 89% of the variance in cell type for mothers), and cohort (c100 vs sv).

Covariates controlled for in all analyses of babies include: Maternal BMI, delivery mode (vaginal vs c-section), parity (yes vs no), maternal age, gestational age, infant sex, cohort (c100 or sv), and the first two principal components of cell type in babies, which explain 90% of the variance in cell type.


top hits predicted values babies

ggarrange(baby_gtsum_pred_plots$cg24590750,
          baby_gtsum_pred_plots$cg10783680,
          ncol = 2, nrow = 1)

ggarrange(baby_setot_pred_plots$cg10338475,
          baby_setot_pred_plots$cg11386818,
          baby_setot_pred_plots$cg02176407,
          baby_setot_pred_plots$cg20807701,
          baby_setot_pred_plots$cg09631059,
          baby_setot_pred_plots$cg06873316,
          ncol = 2, nrow = 3)

ggarrange(baby_awar_nr_pred_plots$cg08985979,
          baby_awar_nr_pred_plots$cg21172322,
          baby_awar_nr_pred_plots$cg00741900,
          ncol = 2, nrow = 2)

methylation and birthweight

Methylation and birthweight

meth_bwgt_model <- lm(bwgt ~ age + bmi + parous + pcsec +
                                     palco + cohort + PC1_cells + PC2_cells + sex + ga_meth +
                                   cg08985979,
                                   data = dfb_sig_awar_nr)

nrow(model.frame(meth_bwgt_model))
## [1] 144
tidy(meth_bwgt_model)
## # A tibble: 12 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)  -3106.    1400.      -2.22  0.0282  
##  2 age             18.2      8.13     2.23  0.0273  
##  3 bmi             26.7     10.0      2.67  0.00850 
##  4 parous          69.7    107.       0.654 0.514   
##  5 pcsec          157.      87.1      1.80  0.0746  
##  6 palco          -89.7    102.      -0.876 0.383   
##  7 cohortSV       -86.7     97.6     -0.888 0.376   
##  8 PC1_cells     -167.     260.      -0.643 0.521   
##  9 PC2_cells     -375.     379.      -0.990 0.324   
## 10 sexM           185.      71.3      2.59  0.0108  
## 11 ga_meth         19.7      4.96     3.98  0.000113
## 12 cg08985979   -4770.    2239.      -2.13  0.0350
cg08985979_pred <- prediction(lm(bwgt ~ age + bmi + parous + pcsec +
                                     palco + cohort + PC1_cells + PC2_cells + sex + ga_meth +
                                   cg08985979,
                                   data = dfb_sig_awar_nr))




# Make another version:

label1 <- expression(bold("b = -4770.2, p = 0.03"))

cg08985979_pred_2 <- 
    ggplot(cg08985979_pred, aes(x = cg08985979, y = fitted)) +
    geom_point(shape = 20, alpha = 2/3) +
    geom_smooth(method = "lm", se = TRUE, color = "red", fill = "salmon") +
    theme_bw() +
    ylab("Adj. birthweight\n (grams)") +
    xlab("Methylation at cg08985979") +
   # ggtitle("Birthweight is associated with\n methylation at cg08985979 in newborns") +
    # annotate("text",x = 0.165, y = 3550,
    #        label = label1, 
    #        hjust = 0, 
    #        fontface = 2,
    #        size = 4) +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(text=element_text(size=6.67))

cg08985979_pred_2

maternal stress and birthweight

War Stress had a significant negative association with birthweight (beta = -125.98798, p = 0.000695) in babies. Update this beta and p in text.

model_awar_nr <- lm(bwgt ~ bmi + age + pcsec + palco + 
                      parous + cohort + ga_meth + sex + awar_nr, data = dfb)

nrow(model.frame(model_awar_nr))
## [1] 144
tidy(model_awar_nr) # significant negative association.
## # A tibble: 10 × 5
##    term        estimate std.error statistic  p.value
##    <chr>          <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)  -2948.    1281.      -2.30  0.0229  
##  2 bmi             23.5      9.64     2.44  0.0162  
##  3 age             18.8      7.87     2.38  0.0185  
##  4 pcsec          227.      82.8      2.74  0.00702 
##  5 palco         -110.      99.0     -1.11  0.270   
##  6 parous          68.0    101.       0.670 0.504   
##  7 cohortSV      -111.      92.2     -1.21  0.229   
##  8 ga_meth         17.8      4.58     3.89  0.000157
##  9 sexM           197.      66.9      2.95  0.00375 
## 10 awar_nr       -126.      36.3     -3.47  0.000695
# Plot the raw data
ggplot(dfb, aes(x = awar_nr, y = bwgt, color = cohort)) +
  geom_point(alpha = 0.75) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_pubclean() +
  ggtitle("No significant interaction between cohort and stress")

awar_nr_bwgt_pred <- prediction(lm(bwgt ~ bmi + age + pcsec + 
                                   palco + parous + cohort + 
                                   ga_meth + sex + awar_nr, data = dfb))

set.seed(456)
awar_nr_bwgt_pred_plot <- 
    ggplot(awar_nr_bwgt_pred, aes(x = awar_nr, y = fitted)) +
    geom_jitter(shape = 20, alpha = 2/3, width = 0.15) +
    geom_smooth(method = "lm", se = TRUE, color = "red", fill = "salmon") +
    theme_bw() +
    ylab("Adj. birthweight\n (grams)") +
    xlab("War Trauma") +
   # ggtitle("Birthweight is associated with\n methylation at cg08985979 in newborns") +
    # annotate("text",x = 0.165, y = 3550,
    #        label = label1, 
    #        hjust = 0, 
    #        fontface = 2,
    #        size = 4) +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(text=element_text(size=6.67))

awar_nr_bwgt_pred_plot

maternal stress and methylation

model_stress_meth <- rlm(cg08985979 ~ bmi + age + pcsec + palco + 
                      parous + cohort + ga_meth + sex + PC1_cells + 
                        PC2_cells + awar_nr, data = dfb_sig_awar_nr)
mod_sum_robust(model_stress_meth) 
## # A tibble: 12 × 7
##    term          estimate std.error statistic       p.value  conf.low conf.high
##    <chr>            <dbl>     <dbl>     <dbl>         <dbl>     <dbl>     <dbl>
##  1 (Intercept)  0.108      0.0465      2.32   0.0205         0.0166    0.199   
##  2 bmi          0.000224   0.000365    0.615  0.539         -0.000490  0.000939
##  3 age          0.0000708  0.000275    0.257  0.797         -0.000469  0.000611
##  4 pcsec       -0.00851    0.00280    -3.04   0.00236       -0.0140   -0.00303 
##  5 palco        0.00445    0.00331     1.35   0.179         -0.00203   0.0109  
##  6 parous       0.00536    0.00373     1.44   0.151         -0.00195   0.0127  
##  7 cohortSV     0.000359   0.00398     0.0903 0.928         -0.00744   0.00816 
##  8 ga_meth     -0.0000111  0.000174   -0.0634 0.949         -0.000353  0.000331
##  9 sexM        -0.000581   0.00250    -0.232  0.816         -0.00548   0.00432 
## 10 PC1_cells   -0.0179     0.00681    -2.63   0.00848       -0.0313   -0.00458 
## 11 PC2_cells   -0.0595     0.0100     -5.93   0.00000000302 -0.0792   -0.0399  
## 12 awar_nr      0.00598    0.00107     5.60   0.0000000210   0.00389   0.00807
nrow(model.frame(model_stress_meth))
## [1] 145
# Plot the raw data
ggplot(dfb_sig_awar_nr, aes(x = awar_nr, y = cg08985979, color = cohort)) +
  geom_point(alpha = 0.75) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_pubclean() +
  ggtitle("No significant interaction between cohort and stress")

stress_meth_pred <- prediction(rlm(cg08985979 ~ bmi + age + pcsec + palco + 
                      parous + cohort + ga_meth + sex + PC1_cells + 
                        PC2_cells + awar_nr, data = dfb_sig_awar_nr))

set.seed(456)
stress_meth_pred_plot <- 
    ggplot(stress_meth_pred, aes(x = awar_nr, y = fitted)) +
    geom_jitter(shape = 20, alpha = 2/3, width = 0.15) +
    geom_smooth(method = "lm", se = TRUE, color = "red", fill = "salmon") +
    theme_bw() +
    ylab("Adj. DNAm at\n cg08985979") +
    xlab("War Trauma") +
   # ggtitle("Birthweight is associated with\n methylation at cg08985979 in newborns") +
    # annotate("text",x = 0.165, y = 3550,
    #        label = label1, 
    #        hjust = 0, 
    #        fontface = 2,
    #        size = 4) +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(text=element_text(size=6.67))

stress_meth_pred_plot

maternal stress and top 10 methylation pcs

A single hit in mothers across 40 tests. General trauma associated with methylation PC6 (p=0.04).

maternal stress and GMM

ALL NEGATIVE RESULTS.

birthweight and GMM

ALL NEGATIVE RESULTS.

maternal stress and cell type distribution mothers

mom_cell_heatmap

Bcell = B cell. CD4T = CD4 positive T cell. CD8T = CD8 positive T cell. Mono = Monocyte. Neu = Neutrophil. NK = Natural Killer cell.

maternal stress and cell type distribution babies

baby_cell_heatmap

Bcell = B cell. CD4T = CD4 positive T cell. CD8T = CD8 positive T cell. Gran = Granulocyte. Mono = Monocyte. NK = Natural Killer cell. nRBC = nucleated Red Blood cell.

Figure 2

mom_eaa_heatmap

This heatmap depicts relationships between epigenetic age acceleration and maternal stress. The different epigenetic clocks listed on the y axis are: Horvath = Horvath’s original clock. IEAA = Intrinsic Epigenetic Age Acceleration. Essentially Horvath’s clock with a cell type correction. EEAA = Extrinsic Epigenetic Age Acceleration. This measure tries to take into account age-associated changes in immune cell type and capitalize on them to make a better measure of age acceleration. This measure depends on cell type by design. Telomere = An epigenetic marker of telomere length. PhenoAge = Predictive of mortality. GrimAge = Also predictive of mortality.

Epigenetic age analysis babies

baby_eaa_heatmap

epiage <- mom_eaa_heatmap %v% baby_eaa_heatmap

draw(epiage)

Epigenetic age plots

mom_setot_pheno_age_prediction <- prediction(lm(AgeAccelPheno ~ bmi + 
                                        pcsec + 
                                        palco + 
                                        parous + 
                                        cohort + 
                                        setot, data = mom_mage))


set.seed(456)
mom_setot_pheno_age_plot <- 
  ggplot(mom_setot_pheno_age_prediction, aes(x = setot, y = fitted, 
                                             fill = fitted,
                                             color = fitted)) +
  geom_jitter(shape = 21, width = 0.25, height = 0.05, 
              color = "black",
              size = 2.5, alpha = 0.75) +
  scale_fill_viridis_c(direction = -1, 
                       option = "magma",
                       name = "PhenoAge\nAcceleration") +
  scale_color_viridis_c(direction = -1, 
                       option = "magma",
                       name = "PhenoAge\nAcceleration") +
  scale_x_continuous(breaks = seq(0,6,1), labels = seq(0,6,1)) +
  theme_classic() +
  geom_smooth(method = "lm", color = "grey", fill = "grey") +
  xlab("Sexual Trauma in Mothers") +
  ylab("Adj. PhenoAge Accel") 


mom_setot_grim_age_prediction <- prediction(lm(AgeAccelGrim ~ bmi + 
                                        pcsec + 
                                        palco + 
                                        parous + 
                                        cohort + 
                                        setot, data = mom_mage))

set.seed(456)
mom_setot_grim_age_plot <- 
  ggplot(mom_setot_grim_age_prediction, aes(x = setot, y = fitted, 
                                            fill = fitted,
                                            color = fitted)) +
  geom_jitter(shape = 21, width = 0.25, height = 0.05, 
              color = "black",
              size = 3, alpha = 0.75) +
  scale_fill_viridis_c(direction = -1, 
                       option = "magma",
                       name = "GrimAge\nAcceleration") +
  scale_color_viridis_c(direction = -1, 
                       option = "magma",
                       name = "GrimAge\nAcceleration") +
  scale_x_continuous(breaks = seq(0,6,1), labels = seq(0,6,1)) +
  theme_classic() +
  geom_smooth(method = "lm", color = "grey", fill = "grey") +
  xlab("Sexual Trauma in Mothers") +
  ylab("Adj. GrimAge Accel") 


mom_setot_EEAA_prediction <- prediction(lm(EEAA ~ bmi + 
                                        pcsec + 
                                        palco + 
                                        parous + 
                                        cohort + 
                                        setot, data = mom_mage))

set.seed(456)
mom_setot_eeaa_age_plot <- 
  ggplot(mom_setot_EEAA_prediction, aes(x = setot, y = fitted, 
                                        fill = fitted,
                                        color = fitted)) +
  geom_jitter(shape = 21, width = 0.25, height = 0.05, 
              color = "black",
              size = 3, alpha = 0.75) +
  scale_fill_viridis_c(direction = -1, 
                       option = "magma",
                       name = "Extrinsic\nEpigenetic\nAge\nAcceleration") +
  scale_color_viridis_c(direction = -1, 
                       option = "magma",
                       name = "Extrinsic\nEpigenetic\nAge\nAcceleration") +
  scale_x_continuous(breaks = seq(0,6,1), labels = seq(0,6,1)) +
  theme_classic() +
  geom_smooth(method = "lm", color = "grey", fill = "grey") +
  xlab("Sexual Trauma in Mothers") +
  ylab("Adj. Extrinsic Accel") 


mom_setot_telomere_prediction <- prediction(lm(DNAmTLAdjAge ~ bmi + 
                                        pcsec + 
                                        palco + 
                                        parous + 
                                        cohort + 
                                        setot, data = mom_mage))

set.seed(456)
mom_setot_telomere_age_plot <- 
  ggplot(mom_setot_telomere_prediction, aes(x = setot, y = fitted, 
                                            fill = fitted,
                                            color = fitted)) +
  geom_jitter(shape = 21, width = 0.25, height = 0.05, 
              color = "black",
              size = 3, alpha = 0.75) +  
  scale_fill_viridis_c(direction = 1, 
                       option = "magma",
                       name = "Telomere\nLength") +
  scale_color_viridis_c(direction = 1, 
                       option = "magma",
                       name = "Telomere\nLength") +
  scale_x_continuous(breaks = seq(0,6,1), labels = seq(0,6,1)) +
  theme_classic() +
  geom_smooth(method = "lm", color = "grey", fill = "grey") +
  xlab("Sexual Trauma in Mothers") +
  ylab("Adj. Telomere Length") 
              

baby_gtsum_EEAA_plot <- prediction(lm(EEAA ~ bmi + pcsec + palco + parous + cohort + 
                      sex + ga_meth + gtsum, data = baby_mage))

set.seed(456)
baby_gtsum_eeaa_plot <- 
  ggplot(baby_gtsum_EEAA_plot, aes(x = gtsum, y = fitted, 
                                   fill = fitted,
                                   color = fitted)) +
  geom_jitter(shape = 21, width = 0.25, height = 0.05, 
              color = "black",
              size = 3, alpha = 0.75) +  
  scale_fill_viridis_c(direction = -1, 
                       option = "magma",
                       name = "Extrinsic\nEpigenetic\nAge\nAcceleration") +
  scale_color_viridis_c(direction = -1, 
                       option = "magma",
                       name = "Extrinsic\nEpigenetic\nAge\nAcceleration") +
  scale_x_continuous(breaks = seq(0,8,1), labels = seq(0,8,1)) +
  theme_classic() +
  geom_smooth(method = "lm", color = "grey", fill = "grey") +
  xlab("General Trauma in Newborns") +
  ylab("Adj. Extrinsic Accel") 


baby_awar_nr_EEAA_plot <- prediction(lm(EEAA ~ bmi + pcsec + palco + parous + cohort + 
                      sex + ga_meth + awar_nr, data = baby_mage))

set.seed(456)
baby_awar_nr_eeaa_plot <- 
  ggplot(baby_awar_nr_EEAA_plot, aes(x = awar_nr, y = fitted, 
                                       fill = fitted,
                                       color = fitted)) +
  geom_jitter(shape = 21, width = 0.25, height = 0.05, 
              color = "black",
              size = 3, alpha = 0.75) +  
  scale_fill_viridis_c(direction = -1, 
                       option = "magma",
                       name = "Extrinsic\nEpigenetic\nAge\nAcceleration") +
   scale_color_viridis_c(direction = -1, 
                       option = "magma",
                       name = "Extrinsic\nEpigenetic\nAge\nAcceleration") +
  scale_x_continuous(breaks = seq(0,8,1), labels = seq(0,8,1)) +
  theme_classic() +
  geom_smooth(method = "lm", color = "grey", fill = "grey") +
  xlab("War Trauma in Newborns") +
  ylab("Adj. Extrinsic Accel") 



addSmallLegend <- function(myPlot, pointSize = 0.5, textSize = 8, spaceLegend = 0.8) {
    myPlot +
        #guides(shape = guide_legend(override.aes = list(size = pointSize)),
        #       color = guide_legend(override.aes = list(size = pointSize))) +
        theme(legend.title = element_text(size = textSize), 
              legend.text  = element_text(size = textSize),
              legend.key.size = unit(spaceLegend, "lines"))
}

           
library(patchwork)           
  addSmallLegend(mom_setot_eeaa_age_plot) +
  addSmallLegend(mom_setot_telomere_age_plot) + 
  addSmallLegend(mom_setot_pheno_age_plot) + 
  addSmallLegend(mom_setot_grim_age_plot) + 
  addSmallLegend(baby_gtsum_eeaa_plot) + 
  addSmallLegend(baby_awar_nr_eeaa_plot) +
  plot_layout(ncol = 2) + 
  plot_annotation(tag_levels = "A")

# Get epigenetic age acceleration model parameters

summary(lm(AgeAccelPheno ~ bmi + pcsec +  palco + parous + cohort + setot, data = mom_mage))
## 
## Call:
## lm(formula = AgeAccelPheno ~ bmi + pcsec + palco + parous + cohort + 
##     setot, data = mom_mage)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.692  -7.441   0.265   7.541  36.288 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.67928    8.00427  -0.959  0.33899    
## bmi          0.01278    0.27573   0.046  0.96308    
## pcsec       -3.41020    2.35950  -1.445  0.15058    
## palco       -1.02164    2.83899  -0.360  0.71949    
## parous      14.57064    2.43344   5.988 1.66e-08 ***
## cohortSV     7.79501    2.46472   3.163  0.00191 ** 
## setot        1.36186    0.60631   2.246  0.02624 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.56 on 142 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.4841, Adjusted R-squared:  0.4623 
## F-statistic: 22.21 on 6 and 142 DF,  p-value: < 2.2e-16
summary(lm(AgeAccelGrim ~ bmi + pcsec +  palco + parous + cohort + setot, data = mom_mage))
## 
## Call:
## lm(formula = AgeAccelGrim ~ bmi + pcsec + palco + parous + cohort + 
##     setot, data = mom_mage)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.8957 -2.0845  0.2629  1.9500  7.5035 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.70218    2.26262   1.194   0.2344    
## bmi         -0.16631    0.07794  -2.134   0.0346 *  
## pcsec       -0.00725    0.66698  -0.011   0.9913    
## palco       -0.21906    0.80252  -0.273   0.7853    
## parous       3.31421    0.68788   4.818 3.68e-06 ***
## cohortSV     1.03075    0.69672   1.479   0.1412    
## setot        0.42460    0.17139   2.477   0.0144 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.266 on 142 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.3933, Adjusted R-squared:  0.3677 
## F-statistic: 15.35 on 6 and 142 DF,  p-value: 1.646e-13
summary(lm(EEAA ~ bmi + pcsec +  palco + parous + cohort + setot, data = mom_mage))
## 
## Call:
## lm(formula = EEAA ~ bmi + pcsec + palco + parous + cohort + setot, 
##     data = mom_mage)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.6705  -4.0579  -0.2834   3.7767  25.2586 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -4.8741     4.8832  -0.998 0.319910    
## bmi           0.0027     0.1682   0.016 0.987218    
## pcsec        -1.9847     1.4395  -1.379 0.170130    
## palco         1.0050     1.7320   0.580 0.562647    
## parous        7.5865     1.4846   5.110 1.02e-06 ***
## cohortSV      5.4545     1.5037   3.628 0.000398 ***
## setot         1.0865     0.3699   2.937 0.003864 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.049 on 142 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.473,  Adjusted R-squared:  0.4507 
## F-statistic: 21.24 on 6 and 142 DF,  p-value: < 2.2e-16
summary(lm(DNAmTLAdjAge ~ bmi + pcsec +  palco + parous + cohort + setot, data = mom_mage))
## 
## Call:
## lm(formula = DNAmTLAdjAge ~ bmi + pcsec + palco + parous + cohort + 
##     setot, data = mom_mage)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.51666 -0.10899 -0.00522  0.11513  0.52193 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.140668   0.123610   1.138 0.257034    
## bmi         -0.002119   0.004258  -0.498 0.619447    
## pcsec        0.046587   0.036438   1.279 0.203142    
## palco        0.029567   0.043842   0.674 0.501155    
## parous      -0.133149   0.037579  -3.543 0.000536 ***
## cohortSV    -0.128444   0.038063  -3.375 0.000954 ***
## setot       -0.021505   0.009363  -2.297 0.023093 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1784 on 142 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.3535, Adjusted R-squared:  0.3262 
## F-statistic: 12.94 on 6 and 142 DF,  p-value: 1.226e-11
summary(lm(EEAA ~ bmi + pcsec + palco + parous + cohort + 
                      sex + ga_meth + gtsum, data = baby_mage))
## 
## Call:
## lm(formula = EEAA ~ bmi + pcsec + palco + parous + cohort + sex + 
##     ga_meth + gtsum, data = baby_mage)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.2612  -3.7631   0.8857   3.9407  15.3441 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -62.687959  19.630883  -3.193  0.00175 **
## bmi          -0.004813   0.146732  -0.033  0.97388   
## pcsec        -1.213191   1.251190  -0.970  0.33395   
## palco        -0.649081   1.525693  -0.425  0.67119   
## parous       -1.164019   1.300273  -0.895  0.37226   
## cohortSV      3.307231   1.330686   2.485  0.01415 * 
## sexM         -0.499230   1.030665  -0.484  0.62890   
## ga_meth       0.207682   0.069349   2.995  0.00326 **
## gtsum         0.696777   0.305255   2.283  0.02401 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.082 on 136 degrees of freedom
##   (10 observations deleted due to missingness)
## Multiple R-squared:  0.1244, Adjusted R-squared:  0.0729 
## F-statistic: 2.415 on 8 and 136 DF,  p-value: 0.01803
summary(lm(EEAA ~ bmi + pcsec + palco + parous + cohort + 
                      sex + ga_meth + awar_nr, data = baby_mage))
## 
## Call:
## lm(formula = EEAA ~ bmi + pcsec + palco + parous + cohort + sex + 
##     ga_meth + awar_nr, data = baby_mage)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.7296  -3.5601   0.6236   4.1669  14.4941 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -65.13881   19.85300  -3.281  0.00131 **
## bmi           0.02099    0.14821   0.142  0.88758   
## pcsec        -1.46966    1.26671  -1.160  0.24799   
## palco        -0.71590    1.53141  -0.467  0.64091   
## parous       -1.23036    1.30770  -0.941  0.34845   
## cohortSV      3.45024    1.34259   2.570  0.01125 * 
## sexM         -0.45440    1.03451  -0.439  0.66118   
## ga_meth       0.21576    0.06989   3.087  0.00245 **
## awar_nr       1.12135    0.56137   1.998  0.04776 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.109 on 136 degrees of freedom
##   (10 observations deleted due to missingness)
## Multiple R-squared:  0.1168, Adjusted R-squared:  0.06482 
## F-statistic: 2.248 on 8 and 136 DF,  p-value: 0.02757

Final Figure

awar_nr_bwgt_pred_plot + 
  cg08985979_pred_2 + 
  stress_meth_pred_plot + 
  plot_layout(ncol = 1) +
  plot_annotation(tag_levels = list(c("A","B","C")))

awar_nr_bwgt_pred_plot + 
            theme_bw(base_size = 9) +
            theme(axis.title = element_text(face="bold")) +
            theme(axis.text = element_text(face="bold")) +

            cg08985979_pred_2 + 
            theme_bw(base_size = 9) +
            theme(axis.title = element_text(face="bold")) +
            theme(axis.text = element_text(face="bold")) +

            stress_meth_pred_plot + 
            theme_bw(base_size = 9) +
            theme(axis.title = element_text(face="bold")) +
            theme(axis.text = element_text(face="bold")) +

            plot_layout(nrow = 1) +
            plot_annotation(tag_levels = list(c("A","B","C")))

Probe Attrition

# This text is copied from above:

####################################################################
# Step 1 probe attrition final counts:

# initial masking of bad probes, snp probes, and non cg probes:
# 127157
# Additional masking of probes failing in all samples by preprocessing:
# 1081
# Additional probes set to NA because of zeroIntensity, that were
# not already marked NA in all samples by initial masking or preprocessing:
# 873.

# Total probes masked for every single person at this stage = 129111.

# In preparation for ComBat, all probes failing in more than 10%
# of samples were removed, inclusive of all those above. That step
# takes us from 866553 to 706987. That's a difference of 159566.
# subtracting 129,111 from 159,566 gives us the number of probes
# were masked because they failed in too many samples (but less 
# than failed in every single sample). 30,455.
####################################################################



# Mothers

mermaid("
graph TB

title[<u>Congo EWAS Probe Attrition in Mothers</u>]
title-->A
style title fill:#FFF,stroke:#FFF
linkStyle 0 stroke:#FFF,stroke-width:0;


A[Total probes on array n = 866,553] -->|remove non-CpG probes,off-target probes,and SNP sites, n = 127,157| B
B[n = 739,396] -->|remove probes that failed in preprocessing for all samples, n = 1,081| C
C[n = 738,315] -->|remove probes that had zero intensity in >10% of samples, n = 873| D
D[n = 737,442] --> |remove probes that failed in >10% of samples, n = 30,455|E
E[n = 706,987] --> |remove probes with Y chromosome annotation, n = 6|F
F[Final probe set n = 706,981] 
"
)
# Babies

mermaid("
graph TB

title[<u>Congo EWAS Probe Attrition in Babies</u>]
title-->A
style title fill:#FFF,stroke:#FFF
linkStyle 0 stroke:#FFF,stroke-width:0;


A[Total probes on array n = 866,553] -->|remove non-CpG probes,off-target probes,and SNP sites, n = 127,157| B
B[n = 739,396] -->|remove probes that failed in preprocessing for all samples, n = 1,081| C
C[n = 738,315] -->|remove probes that had zero intensity in >10% of samples, n = 873| D
D[n = 737,442] --> |remove probes that failed in >10% of samples, n = 30,455|E
E[n = 706,987] --> |remove X and Y chromosome probes, n = 15,120|F
F[Final probe set n = 691,867] 
"
)

Sample Attrition

mermaid("
graph TB

title[<u>Congo EWAS Sample Attrition </u>]
title-->A
style title fill:#FFF,stroke:#FFF
linkStyle 0 stroke:#FFF,stroke-width:0;


A[Total methylation files n = 496] -->|remove placenta samples, n = 24| B
B[n = 472] -->|remove cord blood samples, n = 4| C
C[n = 468] -->|remove twins and their mothers, n = 6| D
D[n = 462] --> |remove follow-up samples, n = 106|E
E[n = 356] --> |remove samples failing methylation qc and sex checks, n = 10|F
F[n = 346] --> |remove samples belonging to small batches, n = 23|G
G[n = 323] --> |remove samples that are technical replicates, n = 7|H
H[n = 316] --> |remove samples that are duplicate mothers or sibling samples, n = 14|I
I[Final data set n = 302. 151 mothers & 151 babies] 
"
)
mermaid("
graph TB

title[<u>Congo Epigenetic Age Sample Attrition </u>]
title-->A
style title fill:#FFF,stroke:#FFF
linkStyle 0 stroke:#FFF,stroke-width:0;


A[Total methylation files n = 496] -->|remove placenta samples, n = 24| B
B[n = 472] -->|remove cord blood samples, n = 4| C
C[n = 468] -->|remove twins and their mothers, n = 6| D
D[n = 462] --> |remove follow-up samples, n = 106|E
E[n = 356] --> |average 41 replicate samples from 17 participants, n = 24|F
F[n = 332] --> |remove samples failing methylation qc and meffil sex checks, n = 8|G
G[n = 324] --> |remove duplicate mom and sibling samples, n = 14|H
H[Final data set n = 310. 155 mothers and 155 newborns] 
"
)

Maternal Stress Table

# 
# # Making a single table for mothers by cohort and overall is fine.
# stress <- read_csv(here("data","maternal_stress_measures_20220112.csv")) 
# 
# pheno_stress <- dfm %>% 
#   left_join(stress, by = c("idno" = "idno"))
# 
# df3 <- df2 %>% 
#   left_join(pheno_stress, by = c("dyad" = "idno"))

df <- read.csv(here("data","survey_data_20220610.csv"), header=TRUE, stringsAsFactors = FALSE)


library(psych)


df$idno <- paste0(df$ID, df$no)
  
# Recode for unhappy marriage
df$happy_marriage <- ifelse(df$marr==0,NA,df$happy_marriage)
df$unhapmar <- ifelse(df$happy_marriage==1,0,1)
df$unhapmar[is.na(df$unhapmar)] <- 0

# Recode for general stress
df$genstress <- ifelse(df$majstress==1 | df$dailystress==1,1,0)
df$genstress[is.na(df$genstress)] <- 0

# Recode for victim of violence
df$vicviol <- ifelse(df$wvicc==1 | df$wvica==1,1,0)

# Reverse code variables in chronic stress scale
df$nohelp <- ifelse(df$phelp==1,0,1)
df$nochoice <- ifelse(df$choice_in_rep_decision==1,0,1)
df$foodinsuf <- ifelse(df$pfood==1,0,1)
df$notwantpreg <- ifelse(df$pwant==1,0,1)
df$noprenat <- ifelse(df$pprenatal_care==1,0,1)
df$trouble_pay_bills <- as.numeric(df$trouble_pay_bills)
df$notownhm <- ifelse(df$ownhm==1,0,1)
df$nowed <- ifelse(df$wedding==1,0,1)
df$no_loca_choice <- ifelse(df$choose_loca_birth==1,0,1)
df$ctalk <- ifelse(df$ctalk==1,0,1)
df$travalone <- ifelse(df$travel_with_someone_to_hospital==1,0,1)

# removed se4, preg_from_rape, raped_while_preg, your_birth_from_rape
# add in wrefug, wrefalone, and refug_bad since these are also war-related variables
awar_nr <- cbind.data.frame(df$wrefug,
                            df$wfkill,
                            df$wknap, 
                            df$Battle.at.home,
                            df$current_refugee,
                            df$wrefalone)
awar_nr <- awar_nr %>% replace(is.na(.),0)

# alpha(awar_nr, check.keys = T) # 0.35

# Chronic stress

achronic <- cbind.data.frame(df$nohelp,
                             df$sickness,
                             df$cried,
                             df$ccry,
                             df$genstress,
                             df$notownhm,
                             df$nochoice,
                             df$foodinsuf,
                             df$notwantpreg,
                             df$noprenat,
                             df$pafraid,
                             df$trouble_pay_bills,
                             df$pemoa,
                             df$pphysa,
                             df$nowed,
                             df$unhapmar)
achronic <- achronic %>% replace(is.na(.), 0)
#alpha(achronic, check.keys = T) # 0.84

# Sexual events

var <- c("se1", "se2", "se3", "se4", "se5", "se6")
se <- df[,var]
se <- se %>% replace(is.na(.),0)
# alpha(se, check.keys = TRUE) # 0.79

# General trauma

# gt5 is negatively correlated so removed
var <- c("gt1", "gt2", "gt3", "gt4", "gt6", "gt7", "gt8", "gt9", "gt10", "gt11")
gt <- df[,var]
gt <- gt %>% replace(is.na(.),0)
# alpha(gt, check.keys = TRUE) # 0.64; gt5 is negatively correlated




# Subset for participants in our data and then make frequency tables


df2 <- df %>% replace(is.na(.),0)

df2 <- df2[which(df2$idno %in% mom_mage$dyad),]

df2$Cohort <- ifelse(df2$ID=="C","General Maternity Ward","Sexual Violence Ward")

# get war trauma frequencies for combined cohorts

df2$war_refugee <- factor(df2$wrefug, levels = c(0,1), 
                     labels = c("No","Yes"))

label(df2$war_refugee) <- "War refugee"

df2$family_killed_war <- factor(df2$wfkill, levels = c(0,1),
                     labels = c("No","Yes"))

label(df2$family_killed_war) <- "Family member killed in war"
                            
df2$kidnapped <- factor(df2$wknap, levels = c(0,1),
                        labels = c("No","Yes"))

label(df2$kidnapped) <- "Kidnapped as a result of the war"

df2$soldiers_torment_village <- factor(df2$Battle.at.home, levels = c(0,1),
                                      labels = c("No","Yes"))

label(df2$soldiers_torment_village) <- "Soldiers tormented village"
                            
df2$current_refugee <- factor(df2$current_refugee, levels = c(0,1),
                             labels = c("No","Yes"))

label(df2$current_refugee) <- "Current refugee"

df2$refugee_alone <- factor(df2$wrefalone, levels = c(0,1),
                           labels = c("No","Yes"))

label(df2$refugee_alone) <- "Was alone as a war refugee"





# get chronic stress frequencies for combined cohorts

df2$no_help_at_home <- factor(df2$nohelp, levels = c(0,1),
                     labels = c("No","Yes")) # no help at home

label(df2$no_help_at_home) <- "No help"

df2$sick_in_pregnancy <- factor(df2$sickness, levels = c(0,1),
                                labels = c("No","Yes")) # sick in pregnancy

label(df2$sick_in_pregnancy) <- "Sickness in pregnancy"

df2$cried_in_pregnancy <- factor(df2$cried, levels = c(0,1),
                                 labels = c("No","Yes")) # did you cry in pregnancy

label(df2$cried_in_pregnancy) <- "Cried during pregnancy"
                             
df2$ashmed_to_cry <- factor(df2$ccry, levels = c(0,1),
                             labels = c("No","Yes")) # did you cope by crying

label(df2$ashmed_to_cry) <- "Ashamed to cry"
                            
df2$stress_in_pregnancy <- factor(df2$genstress, levels = c(0,1),
                                  labels = c("No","Yes")) # stress during pregnancy

label(df2$stress_in_pregnancy) <- "Presence of general stressors"

df2$did_not_own_home <- factor(df2$notownhm, levels = c(0,1),
                               labels = c("No","Yes")) # did not own home

label(df2$did_not_own_home) <- "Did not own home"

df2$no_choice_reproduction <- factor(df2$nochoice, levels = c(0,1),
                                     labels = c("No","Yes")) # no choice in reproductive decision

label(df2$no_choice_reproduction) <- "No choice in reproductive decision"
                             
df2$insufficient_food <- factor(df2$foodinsuf, levels = c(0,1),
                                labels = c("No","Yes")) # insufficient food

label(df2$insufficient_food) <- "Did not have enough food during pregnancy"

df2$did_not_want_pregnancy <- factor(df2$notwantpreg, levels = c(0,1),
                                     labels = c("No","Yes")) # did not want pregnancy

label(df2$did_not_want_pregnancy) <- "Did not want to get pregnant"

df2$no_prenatal_care <- factor(df2$noprenat, levels = c(0,1),
                               labels = c("No","Yes")) # no prenatal care

label(df2$no_prenatal_care) <- "No prenatal care"

df2$afraid_at_night <- factor(df2$pafraid, levels = c(0,1),
                              labels = c("No","Yes")) # afraid at night

label(df2$afraid_at_night) <- "Afraid at night"

df2$trouble_paying_bills <- factor(df2$trouble_pay_bills, levels = c(0,1),
                                   labels = c("No","Yes")) # trouble paying bills

label(df2$trouble_paying_bills) <- "Trouble paying bills"
                             
df2$emotionally_abused <- factor(df2$pemoa, levels = c(0,1),
                                  labels = c("No","Yes")) # emotional abuse

label(df2$emotionally_abused) <- "Emotionally abused"

df2$physically_abused <- factor(df2$pphysa, levels = c(0,1),
                                labels = c("No","Yes")) # physical abuse

label(df2$physically_abused) <- "Physically abused"
                             
df2$no_wedding <- factor(df2$nowed, levels = c(0,1),
                         labels = c("No","Yes")) # no wedding

label(df2$no_wedding) <- "Did not have a wedding"

df2$unhappy_marriage <- factor(df2$unhapmar, levels = c(0,1),
                               labels = c("No","Yes")) # unhappy marriage

label(df2$unhappy_marriage) <- "Unhappy marriage"




# get sexual events frequencies for both cohorts

df2$uncomfortable_touching <- factor(df2$se1, levels = c(0,1),
                                     labels = c("No","Yes"))

label(df2$uncomfortable_touching) <- "Uncomfortably touched in intimate parts"

df2$genital_rubbing <- factor(df2$se2, levels = c(0,1),
                              labels = c("No","Yes"))

label(df2$genital_rubbing) <- "Someone rubbed genitals against you"

df2$forced_to_touch_someone <- factor(df2$se3, levels = c(0,1),
                                      labels = c("No","Yes"))

label(df2$forced_to_touch_someone) <- "Forced to touch intimate parts"

df2$raped_penetrative <- factor(df2$se4, levels = c(0,1),
                                labels = c("No","Yes"))

label(df2$raped_penetrative) <- "Someone had genital sex against your will"

df2$raped_oral_sex <- factor(df2$se5, levels = c(0,1),
                             labels = c("No","Yes"))

label(df2$raped_oral_sex) <- "Forced to perform oral sex"

df2$coerced_kiss <- factor(df2$se6, levels = c(0,1),
                           labels = c("No", "Yes"))

label(df2$coerced_kiss) <- "Forced to kiss someone in a sexual way"







# get general trauma frequencies for both cohorts

df2$natural_disaster <- factor(df2$gt1, levels = c(0,1),
                               labels = c("No","Yes"))

label(df2$natural_disaster) <- "Exposed to life-threatening natural disaster"


df2$serious_accident <- factor(df2$gt2, levels = c(0,1),
                               labels = c("No","Yes"))

label(df2$serious_accident) <- "Involved in a serious accident"

df2$serious_injury_illness <- factor(df2$gt3, levels = c(0,1),
                                     labels = c("No","Yes"))

label(df2$serious_injury_illness) <- "Suffered a serious personal injury or illness"

df2$death_illness_caretaker <- factor(df2$gt4, levels = c(0,1),
                              labels = c("No","Yes"))

label(df2$death_illness_caretaker) <- "Death or serious illness/injury of parent or caretaker"


df2$death_illness_sibling <- factor(df2$gt6, levels = c(0,1),
                                    labels = c("No","Yes"))

label(df2$death_illness_sibling) <- "Death or serious illness/injury of a sibling"


df2$death_illness_friend <- factor(df2$gt7, levels = c(0,1),
                                   labels = c("No","Yes"))

label(df2$death_illness_friend) <- "Death or serious illness/injury of friend"


df2$witnessed_violence <- factor(df2$gt8, levels = c(0,1),
                                 labels = c("No","Yes"))

label(df2$witnessed_violence) <- "Witnessed violence towards others"

df2$family_mental_illness <- factor(df2$gt9, levels = c(0,1),
                                    labels = c("No","Yes"))

label(df2$family_mental_illness) <- "Family mental illness"

df2$caretaker_drugs <- factor(df2$gt10, levels = c(0,1),
                              labels = c("No","Yes"))

label(df2$caretaker_drugs) <- "Parents or caretakes who use drugs"

df2$witnessed_murder <- factor(df2$gt11, levels = c(0,1),
                               labels = c("No","Yes"))

label(df2$witnessed_murder) <- "Witnessed someone murdered"



table1(~ war_refugee +
         family_killed_war +
         kidnapped +
         soldiers_torment_village +
         current_refugee +
         refugee_alone | Cohort, data = df2, overall="Total",
    extra.col=list(`p`=pvalue_table), extra.col.pos=3)
General Maternity Ward
(N=90)
Sexual Violence Ward
(N=65)
p Total
(N=155)
War refugee
No 26 (28.9%) 30 (46.2%) 0.029 56 (36.1%)
Yes 64 (71.1%) 35 (53.8%) 99 (63.9%)
Family member killed in war
No 82 (91.1%) 50 (76.9%) 0.021 132 (85.2%)
Yes 8 (8.9%) 15 (23.1%) 23 (14.8%)
Kidnapped as a result of the war
No 88 (97.8%) 62 (95.4%) 0.65 150 (96.8%)
Yes 2 (2.2%) 3 (4.6%) 5 (3.2%)
Soldiers tormented village
No 81 (90.0%) 59 (90.8%) 1 140 (90.3%)
Yes 9 (10.0%) 6 (9.2%) 15 (9.7%)
Current refugee
No 89 (98.9%) 63 (96.9%) 0.572 152 (98.1%)
Yes 1 (1.1%) 2 (3.1%) 3 (1.9%)
Was alone as a war refugee
No 80 (88.9%) 64 (98.5%) 0.026 144 (92.9%)
Yes 10 (11.1%) 1 (1.5%) 11 (7.1%)
table1(~ no_help_at_home +
         sick_in_pregnancy +
         cried_in_pregnancy +
         ashmed_to_cry +
         stress_in_pregnancy +
         did_not_own_home +
         no_choice_reproduction +
         insufficient_food +
         did_not_want_pregnancy +
         no_prenatal_care +
         afraid_at_night +
         trouble_paying_bills +
         emotionally_abused +
         physically_abused +
         no_wedding +
         unhappy_marriage | Cohort, data = df2, overall="Total",
    extra.col=list(`p`=pvalue_table), extra.col.pos=3)
General Maternity Ward
(N=90)
Sexual Violence Ward
(N=65)
p Total
(N=155)
No help
No 68 (75.6%) 48 (73.8%) 0.852 116 (74.8%)
Yes 22 (24.4%) 17 (26.2%) 39 (25.2%)
Sickness in pregnancy
No 83 (92.2%) 41 (63.1%) <0.001 124 (80.0%)
Yes 7 (7.8%) 24 (36.9%) 31 (20.0%)
Cried during pregnancy
No 37 (41.1%) 18 (27.7%) 0.092 55 (35.5%)
Yes 53 (58.9%) 47 (72.3%) 100 (64.5%)
Ashamed to cry
No 69 (76.7%) 29 (44.6%) <0.001 98 (63.2%)
Yes 21 (23.3%) 36 (55.4%) 57 (36.8%)
Presence of general stressors
No 48 (53.3%) 19 (29.2%) 0.003 67 (43.2%)
Yes 42 (46.7%) 46 (70.8%) 88 (56.8%)
Did not own home
No 25 (27.8%) 15 (23.1%) 0.579 40 (25.8%)
Yes 65 (72.2%) 50 (76.9%) 115 (74.2%)
No choice in reproductive decision
No 61 (67.8%) 8 (12.3%) <0.001 69 (44.5%)
Yes 29 (32.2%) 57 (87.7%) 86 (55.5%)
Did not have enough food during pregnancy
No 72 (80.0%) 39 (60.0%) 0.011 111 (71.6%)
Yes 18 (20.0%) 26 (40.0%) 44 (28.4%)
Did not want to get pregnant
No 62 (68.9%) 5 (7.7%) <0.001 67 (43.2%)
Yes 28 (31.1%) 60 (92.3%) 88 (56.8%)
No prenatal care
No 89 (98.9%) 61 (93.8%) 0.162 150 (96.8%)
Yes 1 (1.1%) 4 (6.2%) 5 (3.2%)
Afraid at night
No 68 (75.6%) 13 (20.0%) <0.001 81 (52.3%)
Yes 22 (24.4%) 52 (80.0%) 74 (47.7%)
Trouble paying bills
No 67 (74.4%) 22 (33.8%) <0.001 89 (57.4%)
Yes 23 (25.6%) 43 (66.2%) 66 (42.6%)
Emotionally abused
No 77 (85.6%) 52 (80.0%) 0.39 129 (83.2%)
Yes 13 (14.4%) 13 (20.0%) 26 (16.8%)
Physically abused
No 78 (86.7%) 56 (86.2%) 1 134 (86.5%)
Yes 12 (13.3%) 9 (13.8%) 21 (13.5%)
Did not have a wedding
No 44 (48.9%) 3 (4.6%) <0.001 47 (30.3%)
Yes 46 (51.1%) 62 (95.4%) 108 (69.7%)
Unhappy marriage
No 75 (83.3%) 63 (96.9%) 0.008 138 (89.0%)
Yes 15 (16.7%) 2 (3.1%) 17 (11.0%)
table1(~ uncomfortable_touching +
         genital_rubbing +
         forced_to_touch_someone +
         raped_penetrative +
         raped_oral_sex +
         coerced_kiss | Cohort, data = df2, overall="Total",
    extra.col=list(`p`=pvalue_table), extra.col.pos=3)
General Maternity Ward
(N=90)
Sexual Violence Ward
(N=65)
p Total
(N=155)
Uncomfortably touched in intimate parts
No 72 (80.0%) 46 (70.8%) 0.189 118 (76.1%)
Yes 18 (20.0%) 19 (29.2%) 37 (23.9%)
Someone rubbed genitals against you
No 76 (84.4%) 49 (75.4%) 0.216 125 (80.6%)
Yes 14 (15.6%) 16 (24.6%) 30 (19.4%)
Forced to touch intimate parts
No 78 (86.7%) 56 (86.2%) 1 134 (86.5%)
Yes 12 (13.3%) 9 (13.8%) 21 (13.5%)
Someone had genital sex against your will
No 80 (88.9%) 8 (12.3%) <0.001 88 (56.8%)
Yes 10 (11.1%) 57 (87.7%) 67 (43.2%)
Forced to perform oral sex
No 83 (92.2%) 59 (90.8%) 0.776 142 (91.6%)
Yes 7 (7.8%) 6 (9.2%) 13 (8.4%)
Forced to kiss someone in a sexual way
No 65 (72.2%) 60 (92.3%) 0.002 125 (80.6%)
Yes 25 (27.8%) 5 (7.7%) 30 (19.4%)
table1(~ natural_disaster +
         serious_accident +
         serious_injury_illness +
         death_illness_caretaker +
         death_illness_sibling +
         death_illness_friend +
         witnessed_violence +
         family_mental_illness +
         caretaker_drugs +
         witnessed_murder | Cohort, data = df2, overall="Total",
    extra.col=list(`p`=pvalue_table), extra.col.pos=3)
General Maternity Ward
(N=90)
Sexual Violence Ward
(N=65)
p Total
(N=155)
Exposed to life-threatening natural disaster
No 40 (44.4%) 27 (41.5%) 0.745 67 (43.2%)
Yes 50 (55.6%) 38 (58.5%) 88 (56.8%)
Involved in a serious accident
No 69 (76.7%) 64 (98.5%) <0.001 133 (85.8%)
Yes 21 (23.3%) 1 (1.5%) 22 (14.2%)
Suffered a serious personal injury or illness
No 73 (81.1%) 59 (90.8%) 0.112 132 (85.2%)
Yes 17 (18.9%) 6 (9.2%) 23 (14.8%)
Death or serious illness/injury of parent or caretaker
No 59 (65.6%) 49 (75.4%) 0.218 108 (69.7%)
Yes 31 (34.4%) 16 (24.6%) 47 (30.3%)
Death or serious illness/injury of a sibling
No 82 (91.1%) 57 (87.7%) 0.595 139 (89.7%)
Yes 8 (8.9%) 8 (12.3%) 16 (10.3%)
Death or serious illness/injury of friend
No 85 (94.4%) 55 (84.6%) 0.054 140 (90.3%)
Yes 5 (5.6%) 10 (15.4%) 15 (9.7%)
Witnessed violence towards others
No 22 (24.4%) 8 (12.3%) 0.066 30 (19.4%)
Yes 68 (75.6%) 57 (87.7%) 125 (80.6%)
Family mental illness
No 82 (91.1%) 63 (96.9%) 0.194 145 (93.5%)
Yes 8 (8.9%) 2 (3.1%) 10 (6.5%)
Parents or caretakes who use drugs
No 82 (91.1%) 63 (96.9%) 0.194 145 (93.5%)
Yes 8 (8.9%) 2 (3.1%) 10 (6.5%)
Witnessed someone murdered
No 85 (94.4%) 61 (93.8%) 1 146 (94.2%)
Yes 5 (5.6%) 4 (6.2%) 9 (5.8%)

Nearest Genes

mom_all_sig3 <- zhou %>% 
  select(probeID,gene) %>% 
  right_join(mom_all_sig, by = c("probeID" = "probe")) %>% 
  mutate(coefficient = round(coef, digits = 3)) %>% 
  mutate(p_value = scientific(pval, digits = 3)) %>% 
  left_join(zhou2, by = c("probeID")) %>% 
  left_join(illumina2, by = c("probeID")) %>% 
  select(probeID,coefficient,p_value,exposure,
         generation,CpG_chrm,CpG_end,gene,CGIposition,distToTSS,gene_context) %>% 
  rename(chromosome=CpG_chrm,position = CpG_end) %>% 
  arrange(exposure)

baby_all_sig3 <- zhou %>% 
  select(probeID,gene) %>% 
  right_join(baby_all_sig, by = c("probeID" = "probe")) %>% 
  mutate(coefficient = round(coef, digits = 3)) %>% 
  mutate(p_value = scientific(pval, digits = 3)) %>% 
  left_join(zhou2, by = c("probeID")) %>% 
  left_join(illumina2, by = c("probeID")) %>% 
  select(probeID,coefficient,p_value,exposure,
         generation,CpG_chrm,CpG_end,gene,CGIposition,distToTSS,gene_context) %>% 
  rename(chromosome=CpG_chrm,position = CpG_end) %>% 
  arrange(exposure)


library(ACME) # to get nearest gene function

# Mothers

# General trauma
m_near_1 <- findClosestGene(mom_all_sig3$chromosome[1],mom_all_sig3$position[1],'hg38')
m_near_2 <- findClosestGene(mom_all_sig3$chromosome[2],mom_all_sig3$position[2],'hg38')
m_near_3 <- findClosestGene(mom_all_sig3$chromosome[3],mom_all_sig3$position[3],'hg38')
m_near_4 <- findClosestGene(mom_all_sig3$chromosome[4],mom_all_sig3$position[4],'hg38')
# Sexual Events
m_near_5 <- findClosestGene(mom_all_sig3$chromosome[5],mom_all_sig3$position[5],'hg38')
m_near_6 <- findClosestGene(mom_all_sig3$chromosome[6],mom_all_sig3$position[6],'hg38')
m_near_7 <- findClosestGene(mom_all_sig3$chromosome[7],mom_all_sig3$position[7],'hg38')
m_near_8 <- findClosestGene(mom_all_sig3$chromosome[8],mom_all_sig3$position[8],'hg38')
m_near_9 <- findClosestGene(mom_all_sig3$chromosome[9],mom_all_sig3$position[9],'hg38')
m_near_10 <- findClosestGene(mom_all_sig3$chromosome[10],mom_all_sig3$position[10],'hg38')
m_near_11 <- findClosestGene(mom_all_sig3$chromosome[11],mom_all_sig3$position[11],'hg38')
m_near_12 <- findClosestGene(mom_all_sig3$chromosome[12],mom_all_sig3$position[12],'hg38')
m_near_13 <- findClosestGene(mom_all_sig3$chromosome[13],mom_all_sig3$position[13],'hg38')
# War Trauma
m_near_14 <- findClosestGene(mom_all_sig3$chromosome[14],mom_all_sig3$position[14],'hg38')
m_near_15 <- findClosestGene(mom_all_sig3$chromosome[15],mom_all_sig3$position[15],'hg38')

m_nearest <- rbind(m_near_1,m_near_2,m_near_3,m_near_4,m_near_5,m_near_6,
                   m_near_7,m_near_8,m_near_9,m_near_10,m_near_11,m_near_12,
                   m_near_13,m_near_14,m_near_15)

m_nearest_2 <- m_nearest %>% 
  distinct(geneName, .keep_all = TRUE)
  


# Newborns

# General Trauma
b_near_1 <- findClosestGene(baby_all_sig3$chromosome[1],baby_all_sig3$position[1],'hg38')
b_near_2 <- findClosestGene(baby_all_sig3$chromosome[2],baby_all_sig3$position[2],'hg38')
# Sexual Events
b_near_3 <- findClosestGene(baby_all_sig3$chromosome[3],baby_all_sig3$position[3],'hg38')
b_near_4 <- findClosestGene(baby_all_sig3$chromosome[4],baby_all_sig3$position[4],'hg38')
b_near_5 <- findClosestGene(baby_all_sig3$chromosome[5],baby_all_sig3$position[5],'hg38')
b_near_6 <- findClosestGene(baby_all_sig3$chromosome[6],baby_all_sig3$position[6],'hg38')
b_near_7 <- findClosestGene(baby_all_sig3$chromosome[7],baby_all_sig3$position[7],'hg38')
b_near_8 <- findClosestGene(baby_all_sig3$chromosome[8],baby_all_sig3$position[8],'hg38')
# War Trauma
b_near_9 <- findClosestGene(baby_all_sig3$chromosome[9],baby_all_sig3$position[9],'hg38')
b_near_10 <- findClosestGene(baby_all_sig3$chromosome[10],baby_all_sig3$position[10],'hg38')
b_near_11 <- findClosestGene(baby_all_sig3$chromosome[11],baby_all_sig3$position[11],'hg38')


b_nearest <- rbind(b_near_1,b_near_2,b_near_3,b_near_4,b_near_5,b_near_6,
                   b_near_7,b_near_8,b_near_9,b_near_10,b_near_11)

b_nearest_2 <- b_nearest %>% 
  distinct(geneName, .keep_all = TRUE)

Make tables that list probes and their nearest gene information

mom_near <- cbind.data.frame(mom_all_sig3,m_nearest_2) %>% 
  relocate(geneName,.after = gene) %>% 
  rename(nearest_gene = geneName)



datatable(mom_near[c("exposure","probeID","coefficient","p_value","gene","nearest_gene","CGIposition",
                         "gene_context")],
          filter = "top", rownames = FALSE, width = '100%',
          options = list(scrollX = TRUE),
          caption = htmltools::tags$caption(style = 'caption-side: top; text-align: center; color:black; font-size:200% ;','All significant hits in mothers across four maternal stress EWAS'))
# Table of significant hits for mothers

mom_near %>% 
select(exposure,probeID,coefficient,p_value,gene,nearest_gene,CGIposition,gene_context) %>% 
arrange(exposure) %>% 
kbl() %>% 
  kable_styling(full_width = FALSE) %>% 
  kable_material() %>% 
  pack_rows(group_label = "General Trauma",1,4) %>% 
  pack_rows(group_label = "Sexual Events",5,13) %>% 
  pack_rows(group_label = "War Stress",14,15)
exposure probeID coefficient p_value gene nearest_gene CGIposition gene_context
General Trauma
general_trauma cg11408019 0.003 1.10e-10 NA RAD54L2 Island
general_trauma cg14519777 0.006 6.87e-08 CTA-339C12.1;CUX1 CUX1 NA Body;Body;Body
general_trauma cg14282695 0.009 1.48e-08 SAMD4A SAMD4A NA Body;Body
general_trauma cg16543391 -0.001 6.25e-08 EML2;MIR330 EML2 N_Shelf TSS1500;TSS200;TSS200;Body;Body
Sexual Events
sexual_events cg21219607 -0.002 5.50e-08 PARP15 PARP15 NA TSS1500;TSS1500;Body;Body
sexual_events cg06308131 -0.004 9.71e-10 MUC4 MUC20 S_Shore Body;Body;Body
sexual_events cg04358942 -0.001 2.86e-08 RP11-64D24.2 LOC101928940 NA
sexual_events cg23527517 -0.008 1.23e-08 OTOP3 OTOP3 NA Body
sexual_events cg14859642 0.007 6.99e-08 NA BPIFA2 NA
sexual_events cg00489624 -0.003 1.10e-08 DLGAP4 DLGAP4 NA 5’UTR
sexual_events cg24308336 -0.007 3.79e-08 ARHGAP40 ARHGAP40 N_Shelf Body
sexual_events cg16765764 -0.002 9.46e-09 DHX35 DHX35 NA Body;Body;Body
sexual_events cg10897169 -0.003 6.81e-08 BCAS4 ADNP NA Body;Body;Body
War Stress
war_stress cg13740840 -0.002 5.99e-08 KIF15;MIR564;TMEM42 MIR564 Island TSS1500;TSS200
war_stress cg26486174 0.013 1.61e-08 NA HMGA1 Island
baby_near <- cbind.data.frame(baby_all_sig3,b_nearest_2) %>% 
  relocate(geneName,.after = gene) %>% 
  rename(nearest_gene = geneName)



datatable(baby_near[c("exposure",
                          "probeID",
                          "coefficient",
                          "p_value",
                          "gene",
                          "nearest_gene",
                          "CGIposition",
                          "gene_context")],
          filter = "top", rownames = FALSE, width = '100%',
          options = list(scrollX = TRUE),
          caption = htmltools::tags$caption(style = 'caption-side: top; text-align: center; color:black; font-size:200% ;','All significant hits in babies across four maternal stress EWAS'))
# Table of significant hits for mothers

baby_near %>% 
select(exposure,probeID,coefficient,p_value,gene,nearest_gene,CGIposition,gene_context) %>% 
arrange(exposure) %>% 
kbl() %>% 
  kable_styling(full_width = FALSE) %>% 
  kable_material() %>% 
  pack_rows(group_label = "General Trauma",1,2) %>% 
  pack_rows(group_label = "Sexual Events",3,8) %>% 
  pack_rows(group_label = "War Stress",9,11)
exposure probeID coefficient p_value gene nearest_gene CGIposition gene_context
General Trauma
general_trauma cg24590750 0.000 4.25e-08 TBPL1 TBPL1 Island TSS1500;TSS200
general_trauma cg10783680 0.000 6.23e-08 EXOC7 EXOC7 Island 5’UTR;1stExon;5’UTR;1stExon;5’UTR;1stExon;5’UTR;1stExon;1stExon;5’UTR;TSS200
Sexual Events
sexual_events cg10338475 0.006 4.74e-11 CTC-436P18.1;SMIM15 SMIM15 S_Shore TSS1500;Body
sexual_events cg11386818 0.002 3.97e-09 SYNCRIP SYNCRIP Island TSS1500;5’UTR;5’UTR;5’UTR;TSS1500;5’UTR
sexual_events cg02176407 0.003 5.69e-08 POU3F2 POU3F2 Island 1stExon
sexual_events cg20807701 0.003 2.80e-11 DNAJB9;PNPLA8;THAP5 THAP5 N_Shore Body;TSS1500;5’UTR;1stExon
sexual_events cg09631059 -0.005 8.48e-09 RP11-1391J7.1;TSPAN4 TSPAN4 N_Shore 5’UTR;Body;Body;Body;Body;Body;Body
sexual_events cg06873316 0.001 4.03e-08 NELL1 NELL1 Island TSS1500;TSS1500
War Stress
war_stress cg08985979 0.006 2.10e-08 AC108142.1 TENM3-AS1 S_Shore Body
war_stress cg21172322 0.018 5.88e-10 BCAT1;RP11-662I13.3 BCAT1 N_Shore Body
war_stress cg00741900 0.007 2.98e-08 DIO3;DIO3OS;MIR1247 DIO3 Island 5’UTR;1stExon;TSS1500

Gene Expression Omnibus

GEO Deposition Part 1

For epigenetic age analyses, processed data are in the noobBetas object but can be regenerated here:

# rearrange the object to comply with formatting for GEO deposition:

noobBetasGEO <- noobBetas[,c(ncol(noobBetas),1:(ncol(noobBetas)-1))]
colnames(noobBetasGEO)[1] <- "ID_REF"

rm(noobBetas)

# add in the detection p-values below

For epigenetic age analyses. Get raw data here:

if(file.exists(here("data", "congo_mothers_and_babies_eaa_raw_GEO.rds"))) {

raw_eaa_GEO <- readRDS(file = here("data", "congo_mothers_and_babies_eaa_raw_GEO.rds"))

} else {
 
 cl <- makeCluster(16)
 registerDoParallel(cl)
 clusterExport(cl, c("readIDATpair","noob","getBetas","pOOBAH"))


 raw_eaa_GEO <- parLapply(cl,df5$Basename, function(pfx) {
   readIDATpair(pfx)
   
   
 })

 stopCluster(cl)

 names(raw_eaa_GEO) <- df5$methylation_id

 saveRDS(raw_eaa_GEO, file = here("data","congo_mothers_and_babies_eaa_raw_GEO.rds"))

}

For epigenetic age analyses. Get detection p values here:

if(file.exists(here("data", "congo_mothers_and_babies_eaa_raw_pvals_GEO.rds"))) {

pvals <- readRDS(file = here("data", "congo_mothers_and_babies_eaa_raw_pvals_GEO.rds"))

} else {

# How to get the detection p-values
cl <- makeCluster(16)
 registerDoParallel(cl)
 clusterExport(cl, c("pOOBAH"))


 pvals <- do.call(cbind,parLapply(cl,raw_eaa_GEO[1:356], function(x) {
   pOOBAH(x, return.pval = TRUE)
   
 }))

 stopCluster(cl)
 
 

 saveRDS(pvals, file = here("data","congo_mothers_and_babies_eaa_raw_pvals_GEO.rds"))
 
}

pvals <- as.data.frame(pvals)

pvals$ID_REF <- rownames(pvals)

Now combine the processed betas with the detection p-values. Write a .gz compressed .csv file with the required formatting.

detection_pvals <- pvals

colnames(detection_pvals)[-ncol(detection_pvals)] <- 
  paste(colnames(pvals)[-ncol(detection_pvals)], "Detection Pval", sep = " ")

processed <- merge(noobBetasGEO,detection_pvals, by = "ID_REF")

processed <- processed[,order(colnames(processed))]

# Make ID_REF the first column to appear
processed <- processed %>%
  relocate(ID_REF)


# fwrite(processed,
#           file = here("output","geo_processed_matrix_v1.csv.gz"),
#           row.names = FALSE)

Now format the raw data and combine with the detection p-values

# Select the appropriate signal intensity values for Type I
# and Type II Infinium probes respectively.

unprocessed <- lapply(raw_eaa_GEO, function(x) {
  
  x <- x %>% 
    mutate(methylated = case_when(
      col == "2" ~ UG,
      col == "G" ~ MG,
      col == "R" ~ MR)) %>% 
    mutate(unmethylated = case_when(
      col == "2" ~ UR,
      col == "G" ~ UG,
      col == "R" ~ UR)) %>% 
    select(Probe_ID,methylated,unmethylated) %>% 
    rename(ID_REF = Probe_ID,
           "Methylated signal" = methylated,
           "Unmethylated signal" = unmethylated)
})

# Change the column names to match formatting requirements
unprocessed <- imap(unprocessed, ~rename_all(.x, function(z) paste(.y, z, sep = " "))) %>%
  bind_cols() %>% 
  rename(ID_REF = 1) %>% 
  select(-contains(" ID_REF"))
  
  
# Now combine with the detection p-values

unprocessed_detect <- unprocessed %>% 
  left_join(detection_pvals, by = c("ID_REF"))
  


# Then reorder the columns as prescribed by GEO
unprocessed_detect <- unprocessed_detect %>% 
  select(order(colnames(unprocessed_detect),decreasing = TRUE)) %>% 
  relocate(ID_REF, everything())

# fwrite(unprocessed_detect,
#           file = here("output","geo_matrix_signal_v1.csv.gz"),
#           row.names = FALSE)

For Epigenetic age analyses. Get metadata sheet here:

eaa_geo_metadata <- df5 %>% 
  mutate(title = paste("genomic DNA from venous blood",
                       1:nrow(df5),
                       sep = " "),
         "source name" = paste("venous blood",
                               1:nrow(df5),
                               sep = " "),
         organism = c("Homo sapiens"),
         "idat file 1" = paste(str_sub(Basename,-19,-1),
                               "_Grn.idat",
                               sep = ""),
         "idat file 2" = paste(str_sub(Basename,-19,-1),
                               "_Red.idat",
                               sep = ""),
         "characteristics: gender" = if_else(sex == "M","Male","Female"),
         molecule = c("genomic DNA"),
         label = c("Cy5 and Cy3"),
         description = c("Normal venous blood sample"),
         platform = c("GPL21145"),
         maternal_age = age,
         Age = ifelse(tissue == "baby_venous_blood",0,age))


eaa_geo_metadata <- mage %>% 
  select(methylation_id,DNAmAge,AgeAccelerationResidual) %>% 
  right_join(eaa_geo_metadata, by = c("methylation_id"))



eaa_geo_metadata$Delivery_Mode <- factor(eaa_geo_metadata$pcsec, levels = c(0,1),
                            labels = c("vaginal","caesarean section"))

eaa_geo_metadata$Alcohol <- factor(eaa_geo_metadata$palco, levels = c(0,1),
                            labels = c("No","Yes"))

eaa_geo_metadata$Parity <- factor(eaa_geo_metadata$is_this_your_first_child, 
                                  levels = c(0,1),
                     labels = c("Multigravida","Primigravida"))

eaa_geo_metadata$Gestational_Age <- eaa_geo_metadata$ga_meth/7

eaa_geo_metadata$General_Trauma <- eaa_geo_metadata$gtsum
eaa_geo_metadata$Sexual_Events <- eaa_geo_metadata$setot
eaa_geo_metadata$War_Stress <- eaa_geo_metadata$awar_nr
eaa_geo_metadata$Chronic_Stress <- eaa_geo_metadata$achronic

eaa_geo_metadata$Cohort <- factor(eaa_geo_metadata$cohort, levels = c("C","SV"),
                     labels = c("General Maternity Ward",
                                "Sexual Violence Ward"))

eaa_geo_metadata$bmi = eaa_geo_metadata$mwgt/((eaa_geo_metadata$mhgt/100)^2)



eaa_geo_metadata <- eaa_geo_metadata %>% 
  rename('characteristics: birthweight' = bwgt,
         'characteristics: maternal bmi' = bmi,
         'characteristics: age' = Age,
         'characteristics: dyad' = dyad,
         'characteristics: maternal age' = maternal_age,
         'characteristics: delivery mode' = Delivery_Mode,
         'characteristics: maternal alcohol' = Alcohol,
         'characteristics: parity' = Parity,
         'characteristics: gestational age' = Gestational_Age,
         'characteristics: general trauma' = General_Trauma,
         'characteristics: sexual trauma' = Sexual_Events,
         'characteristics: war trauma' = War_Stress,
         'characteristics: chronic stress' = Chronic_Stress,
         'characteristics: cohort' = Cohort,
         'characteristics: tissue' = tissue,
         'characteristics: replicate' = replicate,
         'characteristics: replicate id' = replicate_id,
         'characteristics: dna methylation age' = DNAmAge,
         'characteristics: age acceleration' = AgeAccelerationResidual) %>% 
  select(methylation_id,title,'source name', organism,
         'idat file 1','idat file 2',
         'characteristics: gender',
         'characteristics: tissue',
         'characteristics: age',
         'characteristics: birthweight',
         'characteristics: dyad',
         'characteristics: maternal age',
         'characteristics: delivery mode',
         'characteristics: maternal alcohol',
         'characteristics: parity',
         'characteristics: gestational age',
         'characteristics: general trauma',
         'characteristics: sexual trauma',
         'characteristics: war trauma',
         'characteristics: chronic stress',
         'characteristics: cohort',
         'characteristics: replicate',
         'characteristics: replicate id',
         'characteristics: dna methylation age',
         'characteristics: age acceleration',
         molecule,label,
         description, platform)


fwrite(eaa_geo_metadata,
          file = here("output","geo_eaa_metadata_v1.csv"),
          row.names = FALSE)

GEO Deposition Part 2

For EWAS analyses. Get processed data from the “betas” object generated above and format to GEO requirements. Then add in the detection p values.

rm(processed)

betas$ID_REF <- rownames(betas)

ewas_processed <- merge(betas,detection_pvals, by = "ID_REF")

ewas_processed <- ewas_processed[,order(colnames(ewas_processed))]

# Make ID_REF the first column to appear
ewas_processed <- ewas_processed %>%
  relocate(ID_REF)


# fwrite(ewas_processed,
#           file = here("output","geo_ewas_processed_matrix_v1.csv.gz"),
#           row.names = FALSE)

For EWAS analyses. The raw data are the same as the raw data for epigenetic age analyses. Just write out another file and change the name so that the file is stored appropriately.

# fwrite(unprocessed_detect,
#           file = here("output","geo_ewas_matrix_signal_v1.csv.gz"),
#           row.names = FALSE)

the metadata for the ewas are the same as for the epigenetic age analyses.

fwrite(eaa_geo_metadata,
          file = here("output","geo_ewas_metadata_v1.csv"),
          row.names = FALSE)

Don’t forget to write out a codebook explaining column headers in the metadata sheet, which will be the same for both data sets.

sub_names <- colnames(eaa_geo_metadata)

df <- data.frame(matrix(ncol = 2, nrow = 29))
df$X1 <- sub_names
colnames(df) <- c("column header","description")

df$description <- c("unique sample id",
                    "unique title that describes the sample",
                    "briefly identify the biological material",
                    "scientific name of organism from which the biological material was derived",
                    "the Green .idat file corresponding to the sample",
                    "the Red .idat file corresponding to the sample",
                    "the sex of the participant",
                    "describes whether the tissue is mother or newborn venous blood",
                    "age of the participant",
                    "birthweight of the baby in the dyad",
                    "the recruitment dyad to which the participant belongs",
                    "the age of the mother in the dyad",
                    "was the delivery vaginal or cesarean section",
                    "did the mother drink alcohol during pregnancy",
                    "is the newborn the mother's first child",
                    "gestational age in weeks estimated using DNA methylation data",
                    "score based on the general trauma section of the early trauma inventory short form",
                    "score based on the sexual abuse section of the early trauma inventory short form",
                    "score based on an ethnographic measure from a prior publication in Child Development by Darlene Kertes et al. 2016",
                    "score based on an ethnographic measure from a prior publication in Child Development by Darlene Kertes et al. 2016",
                    "was the participant recruited through the general maternity program or the sexual abuse survivor program",
                    "is this sample a replicate",
                    "a unique id assigned to a sample within replicate groups",
                    "dna methylation age of the participant",
                    "age acceleration of the participant",
                    "type of molecule that was extracted from the biological materials",
                    "compound used to label the extract",
                    "basic description of the sample",
                    "what platform were samples processed on")

fwrite(df,
          file = here("output","geo_column_headers_v1.csv"),
          row.names = FALSE)